├── COPYING
├── COPYING.LESSER
├── README
├── collectors
│   ├── 0
│   │   ├── dfstat.py
│   │   ├── elasticsearch.py
│   │   ├── hadoop_datanode_jmx.py
│   │   ├── hbase_regionserver_jmx.py
│   │   ├── ifstat.py
│   │   ├── iostat.py
│   │   ├── mysql.py
│   │   ├── netstat.py
│   │   ├── procnettcp.py
│   │   ├── procstats.py
│   │   ├── redis-stats.py
│   │   ├── riak.py
│   │   ├── zfsiostats.py
│   │   └── zfskernstats.py
│   ├── etc
│   │   ├── config.py
│   │   └── mysqlconf.py
│   └── lib
│       └── jmx-1.0.jar
├── startstop
├── stumbleupon
│   └── monitoring
│       ├── .gitignore
│       ├── Makefile
│       └── jmx.java
├── tcollector.pp
└── tcollector.py
/COPYING:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
   4 | Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
 635 |     <one line to give the program's name and a brief idea of what it does.>
         Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
 648 |     along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
 655 |     <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
 667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/COPYING.LESSER:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | tcollector is a framework to collect data points and store them in OpenTSDB.
2 | It allows you to write simple collectors that it'll run and monitor. It also
3 | handles the communication with the TSDs.
4 |
5 | For more info, see
6 |
7 | http://www.opentsdb.net/tcollector.html
8 |
--------------------------------------------------------------------------------
/collectors/0/dfstat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
 13 | # see <http://www.gnu.org/licenses/>.
14 | """df disk space and inode counts for TSDB """
15 | #
16 | # dfstat.py
17 | #
18 | # df.1kblocks.total total size of fs
19 | # df.1kblocks.used blocks used
 20 | # df.1kblocks.free       blocks free (the code emits "free", not "available")
21 | # df.inodes.total number of inodes
22 | # df.inodes.used number of inodes
23 | # df.inodes.free number of inodes
24 |
25 | # All metrics are tagged with mount= and fstype=
26 | # This makes it easier to exclude stuff like
27 | # tmpfs mounts from disk usage reports.
28 |
29 | # Because tsdb does not like slashes in tags, slashes will
30 | # be replaced by underscores in the mount= tag. In theory
31 | # this could cause problems if you have a mountpoint of
32 | # "/foo/bar/" and "/foo_bar/".
33 |
34 |
35 | import os
36 | import socket
37 | import subprocess
38 | import sys
39 | import time
40 |
41 |
42 | COLLECTION_INTERVAL = 60 # seconds
43 |
def _skip_mount(fields):
    """Return True if this df output row describes a mount we don't track.

    fields is a whitespace-split row of "df -PlT..." output:
    [device, fstype, total, used, free, use%, mountpoint].
    """
    # Pseudo filesystems that never hold interesting data.
    # Most of this stuff is of type tmpfs, but we don't want to
    # blacklist all tmpfs since sometimes it's used for active
    # filesystems (/var/run, /tmp) that we do want to track.
    if fields[1] in ("debugfs", "devtmpfs"):
        return True
    if fields[6] == "/dev":
        return True
    # /dev/shm, /lib/init_rw, /lib/modules, etc.
    if fields[6].startswith("/lib/") or fields[6].startswith("/dev/"):
        return True
    return False


def _collect(metric_prefix, df_flag):
    """Run "df <df_flag>" once and print total/used/free data points.

    metric_prefix is e.g. "df.1kblocks"; df_flag selects 1K-blocks
    ("-PlTk") or inodes ("-PlTi").  Errors are reported on stderr.
    """
    ts = int(time.time())
    df_proc = subprocess.Popen(["df", df_flag], stdout=subprocess.PIPE)
    stdout, _ = df_proc.communicate()
    if df_proc.returncode != 0:
        # NOTE: the original message misquoted the flags ("-Pltk"); report
        # the flag we actually ran with.
        sys.stderr.write("df %s returned %r\n" % (df_flag, df_proc.returncode))
        return
    for line in stdout.split("\n"):  # pylint: disable=E1103
        fields = line.split()
        # Skip the header, blank lines, and any malformed row (a short
        # row would otherwise raise IndexError on fields[2]/fields[6]).
        if len(fields) < 7 or not fields[2].isdigit():
            continue
        if _skip_mount(fields):
            continue
        mount = fields[6]
        for name, value in (("total", fields[2]),
                            ("used", fields[3]),
                            ("free", fields[4])):
            print ("%s.%s %d %s mount=%s fstype=%s"
                   % (metric_prefix, name, ts, value, mount, fields[1]))


def main():
    """dfstats main loop: report disk space and inode usage forever.

    Both sections now apply the same mount filters; previously only the
    1K-blocks section skipped /dev, /lib/*, debugfs and devtmpfs, so the
    inode metrics included pseudo filesystems.
    """
    while True:
        _collect("df.1kblocks", "-PlTk")
        _collect("df.inodes", "-PlTi")
        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)

if __name__ == "__main__":
    main()
109 |
--------------------------------------------------------------------------------
/collectors/0/elasticsearch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2011 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
 13 | # see <http://www.gnu.org/licenses/>.
14 | """ElasticSearch collector""" # Because ES is cool, bonsai cool.
15 | # Tested with ES 0.16.5 and 0.17.x
16 |
17 | import errno
18 | import httplib
19 | try:
20 | import json
21 | except ImportError:
22 | json = None # Handled gracefully in main. Not available by default in <2.6
23 | import socket
24 | import sys
25 | import time
26 |
27 |
28 | COLLECTION_INTERVAL = 15 # seconds
29 | DEFAULT_TIMEOUT = 10.0 # seconds
30 | ES_HOST = "localhost"
31 | ES_PORT = 9200 # TCP port on which ES listens.
32 |
33 | STATUS_MAP = {
34 | "green": 0,
35 | "yellow": 1,
36 | "red": 2,
37 | }
38 |
39 |
def is_numeric(value):
    # Accept the Python 2 numeric scalar types.  Note that bool is a
    # subclass of int, so booleans also count as numeric here.
    numeric_types = (int, long, float)
    return isinstance(value, numeric_types)
42 |
43 |
def err(msg):
    """Write an error message, followed by a newline, to stderr."""
    sys.stderr.write("%s\n" % (msg,))
46 |
47 |
class ESError(RuntimeError):
    """Raised when ElasticSearch answers anything other than 200 OK."""

    def __init__(self, resp):
        super(ESError, self).__init__(str(resp))
        # Keep the failed response object around so callers can inspect it.
        self.resp = resp
54 |
55 |
def request(server, uri):
    """GET the given uri on the given HTTPConnection, decode the JSON body.

    Raises ESError if the server answers anything other than 200 OK.
    """
    server.request("GET", uri)
    response = server.getresponse()
    if response.status == httplib.OK:
        return json.loads(response.read())
    raise ESError(response)
63 |
64 |
def cluster_health(server):
    """Fetch the cluster health summary from ElasticSearch."""
    uri = "/_cluster/health"
    return request(server, uri)
67 |
68 |
def cluster_state(server):
    """Fetch the (filtered) cluster state from ElasticSearch.

    Routing table, metadata and blocks are filtered out to keep the
    response small; we only need the top-level fields (e.g. master_node).
    """
    uri = ("/_cluster/state?filter_routing_table=true"
           "&filter_metadata=true&filter_blocks=true")
    return request(server, uri)
72 |
73 |
def node_stats(server):
    """Fetch the stats of the local ElasticSearch node only."""
    uri = "/_cluster/nodes/_local/stats"
    return request(server, uri)
76 |
77 |
def main(argv):
    """elasticsearch collector main loop: emit node/cluster stats forever.

    Returns 13 when no local ElasticSearch is listening (tcollector's
    "don't respawn" code) and 1 on unrecoverable inconsistencies.
    """
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    server = httplib.HTTPConnection(ES_HOST, ES_PORT)
    try:
        server.connect()
    except socket.error, (erno, e):  # Python 2 tuple-unpacking except form.
        if erno == errno.ECONNREFUSED:
            return 13  # No ES running, ask tcollector to not respawn us.
        raise
    if json is None:
        err("This collector requires the `json' Python module.")
        return 1

    nstats = node_stats(server)
    cluster_name = nstats["cluster_name"]
    # A _local stats response contains exactly one node entry; remember its
    # ID so we can detect identity changes later.
    nodeid, nstats = nstats["nodes"].popitem()

    # ts is rebound in the loop below; printmetric reads it via closure, so
    # it always uses the most recently assigned timestamp.
    ts = None
    def printmetric(metric, value, **tags):
        # Emit one TSDB data point; extra tags render as " key=value" pairs.
        if tags:
            tags = " " + " ".join("%s=%s" % (name, value)
                                  for name, value in tags.iteritems())
        else:
            tags = ""
        print ("elasticsearch.%s %d %s cluster=%s%s"
               % (metric, ts, value, cluster_name, tags))

    while True:
        ts = int(time.time())
        nstats = node_stats(server)
        # Check that the node's identity hasn't changed in the mean time.
        if nstats["cluster_name"] != cluster_name:
            err("cluster_name changed from %r to %r"
                % (cluster_name, nstats["cluster_name"]))
            return 1
        this_nodeid, nstats = nstats["nodes"].popitem()
        if this_nodeid != nodeid:
            err("node ID changed from %r to %r" % (nodeid, this_nodeid))
            return 1

        # Only the elected master emits cluster-wide metrics, so that a
        # multi-node cluster doesn't report them once per node.
        is_master = nodeid == cluster_state(server)["master_node"]
        printmetric("is_master", int(is_master))
        if is_master:
            ts = int(time.time())  # In case last call took a while.
            cstats = cluster_health(server)
            for stat, value in cstats.iteritems():
                if stat == "status":
                    # Map green/yellow/red to 0/1/2 (unknown -> -1).
                    value = STATUS_MAP.get(value, -1)
                elif not is_numeric(value):
                    continue
                printmetric("cluster." + stat, value)

        ts = nstats["os"]["timestamp"] / 1000  # ms -> s
        indices = nstats["indices"]
        printmetric("indices.size", indices["size_in_bytes"])
        printmetric("num_docs", indices["docs"]["num_docs"])
        d = indices["cache"]
        printmetric("cache.field.evictions", d["field_evictions"])
        printmetric("cache.field.size", d["field_size_in_bytes"])
        printmetric("cache.filter.count", d["filter_count"])
        printmetric("cache.filter.evictions", d["filter_evictions"])
        printmetric("cache.filter.size", d["filter_size_in_bytes"])
        d = indices["merges"]
        printmetric("merges.current", d["current"])
        printmetric("merges.total", d["total"])
        printmetric("merges.total_time", d["total_time_in_millis"] / 1000.)
        del indices
        process = nstats["process"]
        ts = process["timestamp"] / 1000  # ms -> s
        # The two ES versions expose open-FD counts differently.
        open_fds = process.get("open_file_descriptors")  # ES 0.17
        if open_fds is None:
            open_fds = process.get("fd")  # ES 0.16
            if open_fds is not None:
                open_fds = open_fds["total"]
        if open_fds is not None:
            printmetric("process.open_file_descriptors", open_fds)
        d = process["cpu"]
        printmetric("process.cpu.percent", d["percent"])
        printmetric("process.cpu.sys", d["sys_in_millis"] / 1000.)
        printmetric("process.cpu.user", d["user_in_millis"] / 1000.)
        d = process["mem"]
        printmetric("process.mem.resident", d["resident_in_bytes"])
        printmetric("process.mem.shared", d["share_in_bytes"])
        printmetric("process.mem.total_virtual", d["total_virtual_in_bytes"])
        del process
        jvm = nstats["jvm"]
        ts = jvm["timestamp"] / 1000  # ms -> s
        d = jvm["mem"]
        printmetric("jvm.mem.heap_used", d["heap_used_in_bytes"])
        printmetric("jvm.mem.heap_committed", d["heap_committed_in_bytes"])
        printmetric("jvm.mem.non_heap_used", d["non_heap_used_in_bytes"])
        printmetric("jvm.mem.non_heap_committed", d["non_heap_committed_in_bytes"])
        d = jvm["threads"]
        printmetric("jvm.threads.count", d["count"])
        printmetric("jvm.threads.peak_count", d["peak_count"])
        for gc, d in jvm["gc"]["collectors"].iteritems():
            printmetric("jvm.gc.collection_count", d["collection_count"], gc=gc)
            printmetric("jvm.gc.collection_time",
                        d["collection_time_in_millis"] / 1000., gc=gc)
        del jvm
        del d
        for stat, value in nstats["network"]["tcp"].iteritems():
            if is_numeric(value):
                printmetric("network.tcp." + stat, value)
        for stat, value in nstats["transport"].iteritems():
            if is_numeric(value):
                printmetric("transport." + stat, value)
        # New in ES 0.17:
        for stat, value in nstats.get("http", {}).iteritems():
            if is_numeric(value):
                printmetric("http." + stat, value)
        del nstats
        time.sleep(COLLECTION_INTERVAL)
191 |
192 |
if __name__ == "__main__":
    # Exit with main()'s return value so tcollector sees 13 ("don't
    # respawn us") when no local ElasticSearch is running.
    sys.exit(main(sys.argv))
195 |
--------------------------------------------------------------------------------
/collectors/0/hadoop_datanode_jmx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2012 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
 13 | # see <http://www.gnu.org/licenses/>.
14 |
15 | import os
16 | import pwd
17 | import re
18 | import signal
19 | import subprocess
20 | import sys
21 | import time
22 |
23 | # If this user doesn't exist, we'll exit immediately.
24 | # If we're running as root, we'll drop privileges using this user.
25 | USER = "hadoop"
26 |
27 | # We add those files to the classpath if they exist.
28 | CLASSPATH = [
29 | "/usr/lib/jvm/java-6-sun/lib/tools.jar",
30 | ]
31 |
32 | # Map certain JVM stats so they are unique and shorter
33 | JMX_SERVICE_RENAMING = {
34 | "GarbageCollector": "datanode.gc",
35 | "OperatingSystem": "datanode.os",
36 | "Threading": "datanode.threads",
37 | }
38 |
39 | IGNORED_METRICS = set(["revision", "hdfsUser", "hdfsDate", "hdfsUrl", "date",
40 | "hdfsRevision", "user", "hdfsVersion", "url", "version",
41 | "NamenodeAddress", "Version", "RpcPort", "HttpPort",
42 | # These are useless as-is because they represent the
43 | # thread that's dedicated to serving JMX RPCs.
44 | "CurrentThreadCpuTime", "CurrentThreadUserTime",
45 | # List of directories used by the DataNode.
46 | "StorageInfo",
47 | "VolumeInfo",
48 | ])
49 |
50 | # How many times, maximum, will we attempt to restart the JMX collector.
51 | # If we reach this limit, we'll exit with an error.
52 | MAX_RESTARTS = 10
53 |
54 | TOP = False # Set to True when we want to terminate.
55 | RETVAL = 0 # Return value set by signal handler.
56 |
57 |
def drop_privileges():
    """Exit if USER doesn't exist; otherwise drop root privileges to USER.

    Exits with status 13 (the code other collectors use to tell tcollector
    not to respawn them) when the target user is missing.  A no-op when not
    running as root.
    """
    try:
        pw_entry = pwd.getpwnam(USER)
    except KeyError:
        sys.stderr.write("Not running, user '%s' doesn't exist\n" % USER)
        sys.exit(13)
    if os.getuid() == 0:
        # Set the group first: after setuid() away from root we would no
        # longer be allowed to change it.
        os.setgid(pw_entry.pw_gid)
        os.setuid(pw_entry.pw_uid)
70 |
71 |
def kill(proc):
    """Terminate the given subprocess, escalating SIGTERM -> SIGKILL.

    Closes the process's stdout, then, only if it is still running, sends
    SIGTERM and, if that doesn't do it, SIGKILL.  Returns the exit status.
    """
    # Release our end of the pipe first.
    proc.stdout.close()
    status = proc.poll()
    if status is None:  # Still running: ask nicely first.
        os.kill(proc.pid, 15)  # SIGTERM
        status = proc.poll()
        if status is None:
            os.kill(proc.pid, 9)  # SIGKILL -- Bang bang!
            status = proc.wait()  # Shouldn't block for long after SIGKILL.
    sys.stderr.write("warning: proc exited %d\n" % status)
    return status
85 |
86 |
def do_on_signal(signum, func, *args, **kwargs):
    """Arrange for func(*args, **kwargs) to run, then exit, on signal signum.

    The process exits with the conventional status 128 + <signal number>.
    """
    def _handler(received_signum, frame):
        sys.stderr.write("got signal %d, exiting\n" % received_signum)
        func(*args, **kwargs)
        sys.exit(128 + received_signum)
    signal.signal(signum, _handler)
94 |
95 |
96 | def main(argv):
97 | drop_privileges()
98 | # Build the classpath.
99 | dir = os.path.dirname(sys.argv[0])
100 | jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar")
101 | if not os.path.exists(jar):
102 | print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar
103 | return 13
104 | classpath = [jar]
105 | for jar in CLASSPATH:
106 | if os.path.exists(jar):
107 | classpath.append(jar)
108 | classpath = ":".join(classpath)
109 |
110 | jmx = subprocess.Popen(
111 | ["java", "-enableassertions", "-enablesystemassertions", # safe++
112 | "-Xmx64m", # Low RAM limit, to avoid stealing too much from prod.
113 | "-cp", classpath, "com.stumbleupon.monitoring.jmx",
114 | "--watch", "10", "--long", "--timestamp",
115 | "DataNode", # Name of the process.
116 | # The remaining arguments are pairs (mbean_regexp, attr_regexp).
117 | # The first regexp is used to match one or more MBeans, the 2nd
118 | # to match one or more attributes of the MBeans matched.
119 | "hadoop", "", # All HBase / hadoop metrics.
120 | "Threading", "Count|Time$", # Number of threads and CPU time.
121 | "OperatingSystem", "OpenFile", # Number of open files.
122 | "GarbageCollector", "Collection", # GC runs and time spent GCing.
123 | ], stdout=subprocess.PIPE, bufsize=1)
124 | do_on_signal(signal.SIGINT, kill, jmx)
125 | do_on_signal(signal.SIGPIPE, kill, jmx)
126 | do_on_signal(signal.SIGTERM, kill, jmx)
127 | try:
128 | prev_timestamp = 0
129 | while True:
130 | line = jmx.stdout.readline()
131 |
132 | if not line and jmx.poll() is not None:
133 | break # Nothing more to read and process exited.
134 | elif len(line) < 4:
135 | print >>sys.stderr, "invalid line (too short): %r" % line
136 | continue
137 |
138 | timestamp, metric, value, mbean = line.split("\t", 3)
139 | # Sanitize the timestamp.
140 | try:
141 | timestamp = int(timestamp)
142 | if timestamp < time.time() - 600:
143 | raise ValueError("timestamp too old: %d" % timestamp)
144 | if timestamp < prev_timestamp:
145 | raise ValueError("timestamp out of order: prev=%d, new=%d"
146 | % (prev_timestamp, timestamp))
147 | except ValueError, e:
148 | print >>sys.stderr, ("Invalid timestamp on line: %r -- %s"
149 | % (line, e))
150 | continue
151 | prev_timestamp = timestamp
152 |
153 | if metric in IGNORED_METRICS:
154 | continue
155 |
156 | tags = ""
157 | # The JMX metrics have per-request-type metrics like so:
158 | # metricNameNumOps
159 | # metricNameMinTime
160 | # metricNameMaxTime
161 | # metricNameAvgTime
162 | # Group related metrics together in the same metric name, use tags
163 | # to separate the different request types, so we end up with:
164 | # numOps op=metricName
165 | # avgTime op=metricName
166 | # etc, which makes it easier to graph things with the TSD.
167 | if metric.endswith("MinTime"): # We don't care about the minimum
168 | continue # time taken by operations.
169 | elif metric.endswith("NumOps"):
170 | tags = " op=" + metric[:-6]
171 | metric = "numOps"
172 | elif metric.endswith("AvgTime"):
173 | tags = " op=" + metric[:-7]
174 | metric = "avgTime"
175 | elif metric.endswith("MaxTime"):
176 | tags = " op=" + metric[:-7]
177 | metric = "maxTime"
178 |
179 | # mbean is of the form "domain:key=value,...,foo=bar"
180 | # some tags can have spaces, so we need to fix that.
181 | mbean_domain, mbean_properties = mbean.rstrip().replace(" ", "_").split(":", 1)
182 | if mbean_domain not in ("hadoop", "java.lang"):
183 | print >>sys.stderr, ("Unexpected mbean domain = %r on line %r"
184 | % (mbean_domain, line))
185 | continue
186 | mbean_properties = dict(prop.split("=", 1)
187 | for prop in mbean_properties.split(","))
188 | if mbean_domain == "hadoop":
189 | # jmx_service is HBase by default, but we can also have
190 | # RegionServer or Replication and such.
191 | jmx_service = mbean_properties.get("service", "HBase")
192 | if jmx_service == "HBase":
193 | jmx_service = "regionserver"
194 | elif mbean_domain == "java.lang":
195 | jmx_service = mbean_properties.pop("type", "jvm")
196 | if mbean_properties:
197 | tags += " " + " ".join(k + "=" + v for k, v in
198 | mbean_properties.iteritems())
199 | else:
200 | assert 0, "Should never be here"
201 |
202 | jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service)
203 | metric = jmx_service.lower() + "." + metric
204 |
205 | sys.stdout.write("hadoop.%s %d %s%s\n"
206 | % (metric, timestamp, value, tags))
207 | sys.stdout.flush()
208 | finally:
209 | kill(jmx)
210 | time.sleep(300)
211 | return 0 # Ask the tcollector to re-spawn us.
212 |
213 |
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status for tcollector.
    sys.exit(main(sys.argv))
216 |
--------------------------------------------------------------------------------
/collectors/0/hbase_regionserver_jmx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
# see <http://www.gnu.org/licenses/>.
14 |
15 | import os
16 | import pwd
17 | import re
18 | import signal
19 | import subprocess
20 | import sys
21 | import time
22 | import traceback
23 |
# If this user doesn't exist, we'll exit immediately.
# If we're running as root, we'll drop privileges using this user.
USER = "hadoop"

# We add those files to the classpath if they exist.
# NOTE(review): tools.jar is presumably needed by jmx.jar to attach to the
# target JVM on Sun JDK 6 — confirm before changing this path.
CLASSPATH = [
    "/usr/lib/jvm/java-6-sun/lib/tools.jar",
]

# We shorten certain strings to avoid excessively long metric names.
JMX_SERVICE_RENAMING = {
    "GarbageCollector": "gc",
    "OperatingSystem": "os",
    "Threading": "threads",
    # New in 0.92.1, from HBASE-5325:
    "org.apache.hbase": "hbase",
}
41 |
def drop_privileges():
    """Drops root privileges by switching to USER; exits 13 if USER is unknown."""
    try:
        passwd_entry = pwd.getpwnam(USER)
    except KeyError:
        print >>sys.stderr, "Not running, user '%s' doesn't exist" % USER
        sys.exit(13)

    # Only root can (and needs to) change identity.
    if os.getuid() == 0:
        os.setgid(passwd_entry.pw_gid)
        os.setuid(passwd_entry.pw_uid)
54 |
55 |
def kill(proc):
    """Kills the subprocess given in argument."""
    # Clean up after ourselves.
    proc.stdout.close()
    status = proc.poll()
    if status is None:
        os.kill(proc.pid, signal.SIGTERM)  # Ask nicely first (15).
        status = proc.poll()
        if status is None:
            os.kill(proc.pid, signal.SIGKILL)  # Bang bang! (9)
            status = proc.wait()  # This shouldn't block too long.
    print >>sys.stderr, "warning: proc exited %d" % status
    return status
69 |
70 |
def do_on_signal(signum, func, *args, **kwargs):
    """Calls func(*args, **kwargs) before exiting when receiving signum."""
    def handler(received_signum, frame):
        # Run the cleanup callback, then exit with the conventional
        # 128 + signal-number status.
        print >>sys.stderr, "got signal %d, exiting" % received_signum
        func(*args, **kwargs)
        sys.exit(128 + received_signum)
    signal.signal(signum, handler)
78 |
79 |
80 | def main(argv):
81 | drop_privileges()
82 | # Build the classpath.
83 | dir = os.path.dirname(sys.argv[0])
84 | jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar")
85 | if not os.path.exists(jar):
86 | print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar
87 | return 13
88 | classpath = [jar]
89 | for jar in CLASSPATH:
90 | if os.path.exists(jar):
91 | classpath.append(jar)
92 | classpath = ":".join(classpath)
93 |
94 | jmx = subprocess.Popen(
95 | ["java", "-enableassertions", "-enablesystemassertions", # safe++
96 | "-Xmx64m", # Low RAM limit, to avoid stealing too much from prod.
97 | "-cp", classpath, "com.stumbleupon.monitoring.jmx",
98 | "--watch", "10", "--long", "--timestamp",
99 | "HRegionServer", # Name of the process.
100 | # The remaining arguments are pairs (mbean_regexp, attr_regexp).
101 | # The first regexp is used to match one or more MBeans, the 2nd
102 | # to match one or more attributes of the MBeans matched.
103 | "hadoop", "", # All HBase / hadoop metrics.
104 | "Threading", "Count|Time$", # Number of threads and CPU time.
105 | "OperatingSystem", "OpenFile", # Number of open files.
106 | "GarbageCollector", "Collection", # GC runs and time spent GCing.
107 | ], stdout=subprocess.PIPE, bufsize=1)
108 | do_on_signal(signal.SIGINT, kill, jmx)
109 | do_on_signal(signal.SIGPIPE, kill, jmx)
110 | do_on_signal(signal.SIGTERM, kill, jmx)
111 | try:
112 | prev_timestamp = 0
113 | while True:
114 | line = jmx.stdout.readline()
115 |
116 | if not line and jmx.poll() is not None:
117 | break # Nothing more to read and process exited.
118 | elif len(line) < 4:
119 | print >>sys.stderr, "invalid line (too short): %r" % line
120 | continue
121 |
122 | try:
123 | timestamp, metric, value, mbean = line.split("\t", 3)
124 | except ValueError, e:
125 | # Temporary workaround for jmx.jar not printing these lines we
126 | # don't care about anyway properly.
127 | if "java.lang.String" not in line:
128 | print >>sys.stderr, "Can't split line: %r" % line
129 | continue
130 |
131 | # Sanitize the timestamp.
132 | try:
133 | timestamp = int(timestamp)
134 | if timestamp < time.time() - 600:
135 | raise ValueError("timestamp too old: %d" % timestamp)
136 | if timestamp < prev_timestamp:
137 | raise ValueError("timestamp out of order: prev=%d, new=%d"
138 | % (prev_timestamp, timestamp))
139 | except ValueError, e:
140 | print >>sys.stderr, ("Invalid timestamp on line: %r -- %s"
141 | % (line, e))
142 | continue
143 | prev_timestamp = timestamp
144 |
145 | tags = ""
146 | # The JMX metrics have per-request-type metrics like so:
147 | # metricNameNumOps
148 | # metricNameMinTime
149 | # metricNameMaxTime
150 | # metricNameAvgTime
151 | # Group related metrics together in the same metric name, use tags
152 | # to separate the different request types, so we end up with:
153 | # numOps op=metricName
154 | # avgTime op=metricName
155 | # etc, which makes it easier to graph things with the TSD.
156 | if metric.endswith("MinTime"): # We don't care about the minimum
157 | continue # time taken by operations.
158 | elif metric.endswith("NumOps"):
159 | tags = " op=" + metric[:-6]
160 | metric = "numOps"
161 | elif metric.endswith("AvgTime"):
162 | tags = " op=" + metric[:-7]
163 | metric = "avgTime"
164 | elif metric.endswith("MaxTime"):
165 | tags = " op=" + metric[:-7]
166 | metric = "maxTime"
167 |
168 | # mbean is of the form "domain:key=value,...,foo=bar"
169 | mbean_domain, mbean_properties = mbean.rstrip().split(":", 1)
170 | if mbean_domain not in ("hadoop", "java.lang"):
171 | print >>sys.stderr, ("Unexpected mbean domain = %r on line %r"
172 | % (mbean_domain, line))
173 | continue
174 | mbean_properties = dict(prop.split("=", 1)
175 | for prop in mbean_properties.split(","))
176 | if mbean_domain == "hadoop":
177 | # jmx_service is HBase by default, but we can also have
178 | # RegionServer or Replication and such.
179 | jmx_service = mbean_properties.get("service", "HBase")
180 | if jmx_service == "HBase":
181 | jmx_service = "regionserver"
182 | elif mbean_domain == "java.lang":
183 | jmx_service = mbean_properties.pop("type", "jvm")
184 | if mbean_properties:
185 | tags += " " + " ".join(k + "=" + v for k, v in
186 | mbean_properties.iteritems())
187 | else:
188 | assert 0, "Should never be here"
189 |
190 | # Hack. Right now, the RegionServer is printing stats for its own
191 | # replication queue, but when another RegionServer dies, this one
192 | # may take over the replication queue of the dead one. When this
193 | # happens, we'll get the same metrics multiple times, because
194 | # internally the RegionServer has multiple queues (although only
195 | # only one is actively used, the other ones get flushed and
196 | # discarded). The following `if' statement is simply discarding
197 | # stats for "recovered" replication queues, because we can't keep
198 | # track of them properly in TSDB, because there is no sensible
199 | # tag we can use to differentiate queues.
200 | if jmx_service == "Replication":
201 | attr_name = mbean_properties.get("name", "")
202 | # Normally the attribute will look this:
203 | # ReplicationSource for
204 | # Where is the ID of the destination cluster.
205 | # But when this is the recovered queue of a dead RegionServer:
206 | # ReplicationSource for -%2C%2C
207 | # Where , and relate to the dead RS.
208 | # So we discriminate those entries by looking for a dash.
209 | if "ReplicationSource" in attr_name and "-" in attr_name:
210 | continue
211 |
212 | jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service)
213 | jmx_service, repl_count = re.subn("[^a-zA-Z0-9]+", ".",
214 | jmx_service)
215 | if repl_count:
216 | print >>sys.stderr, ("Warning: found malformed"
217 | " jmx_service=%r on line=%r"
218 | % (mbean_properties["service"], line))
219 | metric = jmx_service.lower() + "." + metric
220 |
221 | sys.stdout.write("hbase.%s %d %s%s\n"
222 | % (metric, timestamp, value, tags))
223 | sys.stdout.flush()
224 | finally:
225 | kill(jmx)
226 | time.sleep(300)
227 | return 0 # Ask the tcollector to re-spawn us.
228 |
229 |
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status for tcollector.
    sys.exit(main(sys.argv))
232 |
--------------------------------------------------------------------------------
/collectors/0/ifstat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
# see <http://www.gnu.org/licenses/>.
14 | #
15 | """network interface stats for TSDB"""
16 |
17 | import os
18 | import sys
19 | import time
20 | import socket
21 | import re
22 |
23 |
24 | # /proc/net/dev has 16 fields, 8 for receive and 8 for xmit
25 | # The fields we care about are defined here. The
26 | # ones we want to skip we just leave empty.
27 | # So we can aggregate up the total bytes, packets, etc
28 | # we tag each metric with direction=in or =out
# and iface=<interface name>
30 |
# Names for the first 4 of the 8 per-direction counters in /proc/net/dev;
# entries set to None are skipped by the main loop below.
FIELDS = ("bytes", "packets", "errs", "dropped",
          None, None, None, None,)
33 |
def main():
    """ifstat main loop: dump /proc/net/dev counters every 15 seconds."""
    collection_interval = 15

    # We just care about ethN interfaces.  We specifically want to avoid
    # bond interfaces, because interface stats are still kept on the child
    # interfaces when you bond; by skipping bond we avoid double counting.
    # Compile the pattern once, outside the loop.
    iface_re = re.compile(r"\s+(eth\d+):(.*)")

    netdev = open("/proc/net/dev", "r")
    while True:
        netdev.seek(0)
        ts = int(time.time())
        for line in netdev:
            match = iface_re.match(line)
            if match is None:
                continue
            iface = match.group(1)
            counters = match.group(2).split(None)
            # Fields 0-7 are the receive counters, 8-15 the transmit ones.
            for idx, field in enumerate(FIELDS):
                if not field:
                    continue
                print ("proc.net.%s %d %s iface=%s direction=in"
                       % (field, ts, counters[idx], iface))
                print ("proc.net.%s %d %s iface=%s direction=out"
                       % (field, ts, counters[idx + 8], iface))

        sys.stdout.flush()
        time.sleep(collection_interval)
61 |
if __name__ == "__main__":
    # Entry point: run the collection loop forever.
    main()
64 |
65 |
--------------------------------------------------------------------------------
/collectors/0/iostat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
# see <http://www.gnu.org/licenses/>.
14 |
15 | """iostat statistics for TSDB"""
16 |
17 | # data is from /proc/diskstats
18 |
19 | # Calculate disk statistics. We handle 2.6 kernel output only, both
20 | # pre-2.6.25 and post (which added back per-partition disk stats).
21 | # (diskstats output significantly changed from 2.4).
22 | # The fields (from iostats.txt) are mainly rate counters
23 | # (either number of operations or number of milliseconds doing a
24 | # particular operation), so let's just let TSD do the rate
25 | # calculation for us.
26 | #
27 | # /proc/diskstats has 11 stats for a given device
28 | # these are all rate counters except ios_in_progress
29 | # .read_requests Number of reads completed
30 | # .read_merged Number of reads merged
31 | # .read_sectors Number of sectors read
32 | # .msec_read Time in msec spent reading
33 | # .write_requests Number of writes completed
34 | # .write_merged Number of writes merged
35 | # .write_sectors Number of sectors written
36 | # .msec_write Time in msec spent writing
37 | # .ios_in_progress Number of I/O operations in progress
38 | # .msec_total Time in msec doing I/O
39 | # .msec_weighted_total Weighted time doing I/O (multiplied by ios_in_progress)
40 |
41 | # in 2.6.25 and later, by-partition stats are reported same as disks
42 | # in 2.6 before 2.6.25, partitions have 4 stats per partition
43 | # .read_issued
44 | # .read_sectors
45 | # .write_issued
46 | # .write_sectors
47 | # For partitions, these *_issued are counters collected before
48 | # requests are merged, so aren't the same as *_requests (which is
# post-merge, which more closely represents the actual
50 | # number of disk transactions).
51 |
52 | # Given that diskstats provides both per-disk and per-partition data,
53 | # for TSDB purposes we want to put them under different metrics (versus
54 | # the same metric and different tags). Otherwise, if you look at a
55 | # given metric, the data for a given box will be double-counted, since
56 | # a given operation will increment both the disk series and the
57 | # partition series. To fix this, we output by-disk data to iostat.disk.*
58 | # and by-partition data to iostat.part.*.
59 |
60 | # TODO: Add additional tags to map partitions/disks back to mount
61 | # points/swap so you can (for example) plot just swap partition
62 | # activity or /var/lib/mysql partition activity no matter which
63 | # disk/partition this happens to be. This is nontrivial, especially
64 | # when you have to handle mapping of /dev/mapper to dm-N, pulling out
65 | # swap partitions from /proc/swaps, etc.
66 |
67 | # TODO: add some generated stats from iostat -x like svctm, await,
68 | # %util. These need to pull in cpu idle counters from /proc.
69 |
70 |
71 | import os
72 | import socket
73 | import sys
74 | import time
75 |
COLLECTION_INTERVAL = 60 # seconds

# Docs come from the Linux kernel's Documentation/iostats.txt
FIELDS_DISK = (
    "read_requests",        # Total number of reads completed successfully.
    "read_merged",          # Adjacent read requests merged in a single req.
    "read_sectors",         # Total number of sectors read successfully.
    "msec_read",            # Total number of ms spent by all reads.
    "write_requests",       # total number of writes completed successfully.
    "write_merged",         # Adjacent write requests merged in a single req.
    "write_sectors",        # total number of sectors written successfully.
    "msec_write",           # Total number of ms spent by all writes.
    "ios_in_progress",      # Number of actual I/O requests currently in flight.
    "msec_total",           # Amount of time during which ios_in_progress >= 1.
    "msec_weighted_total",  # Measure of recent I/O completion time and backlog.
)

# Pre-2.6.25 kernels only export these 4 counters per partition (see the
# module-level comments above).
FIELDS_PART = ("read_issued",
               "read_sectors",
               "write_issued",
               "write_sectors",
              )
98 |
99 |
def main():
    """iostats main loop: dump /proc/diskstats counters every minute."""
    diskstats = open("/proc/diskstats", "r")

    while True:
        diskstats.seek(0)
        ts = int(time.time())
        for line in diskstats:
            # Fields: major, minor, device name, then the stat counters
            # (see the module-level comments for their meaning).
            values = line.split(None)
            # Shortcut the deduper and just skip disks that haven't done a
            # single read.  This eliminates a bunch of loopback, ramdisk and
            # cdrom devices but still lets us report on the rare case that
            # we actually use a ramdisk.
            if values[3] == "0":
                continue

            major, minor = int(values[0]), int(values[1])
            if minor % 16 == 0 and major > 1:
                prefix = "iostat.disk."
            else:
                prefix = "iostat.part."

            # Sometimes there can be a slash in the device name, see bug #8.
            # TODO(tsuna): Remove the substitution once TSD allows `/' in tags.
            device = values[2].replace("/", "_")
            if len(values) == 14:
                # Full stats line.
                fields = FIELDS_DISK
            elif len(values) == 7:
                # Partial stats line.
                fields = FIELDS_PART
            else:
                print >> sys.stderr, "Cannot parse /proc/diskstats line: ", line
                continue
            for offset, name in enumerate(fields):
                print ("%s%s %d %s dev=%s"
                       % (prefix, name, ts, values[offset + 3], device))

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
144 |
145 |
146 |
if __name__ == "__main__":
    # Entry point: run the collection loop forever.
    main()
149 |
150 |
--------------------------------------------------------------------------------
/collectors/0/mysql.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2011 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
# see <http://www.gnu.org/licenses/>.
14 | """Collector for MySQL."""
15 |
16 | import errno
17 | import os
18 | import re
19 | import socket
20 | import stat
21 | import sys
22 | import time
23 |
24 | try:
25 | import MySQLdb
26 | except ImportError:
27 | MySQLdb = None # This is handled gracefully in main()
28 |
29 | # This is really ugly, but we don't have a good way of passing
30 | # configuration data down to the collectors at the moment :(
31 | sys.path.append(os.path.dirname(sys.argv[0]) + "/../etc")
32 | import mysqlconf
33 |
COLLECTION_INTERVAL = 15 # seconds
CONNECT_TIMEOUT = 2 # seconds
# How frequently we try to find new databases.
DB_REFRESH_INTERVAL = 60 # seconds
# Usual locations where to find the default socket file.
DEFAULT_SOCKFILES = set([
    "/tmp/mysql.sock",                  # MySQL's own default.
    "/var/lib/mysql/mysql.sock",        # RH-type / RPM systems.
    "/var/run/mysqld/mysqld.sock",      # Debian-type systems.
])
# Directories under which to search additional socket files.
# find_sockfiles() expects a <dir>/<db dir>/<socket file> layout here.
SEARCH_DIRS = [
    "/var/lib/mysql",
]
48 |
def err(msg):
    """Prints a message to stderr, followed by a newline."""
    sys.stderr.write("%s\n" % msg)
51 |
52 | class DB(object):
53 | """Represents a MySQL server (as we can monitor more than 1 MySQL)."""
54 |
55 | def __init__(self, sockfile, dbname, db, cursor, version):
56 | """Constructor.
57 |
58 | Args:
59 | sockfile: Path to the socket file.
60 | dbname: Name of the database for that socket file.
61 | db: A MySQLdb connection opened to that socket file.
62 | cursor: A cursor acquired from that connection.
63 | version: What version is this MySQL running (from `SELECT VERSION()').
64 | """
65 | self.sockfile = sockfile
66 | self.dbname = dbname
67 | self.db = db
68 | self.cursor = cursor
69 | self.version = version
70 | self.master = None
71 | self.slave_bytes_executed = None
72 | self.relay_bytes_relayed = None
73 |
74 | version = version.split(".")
75 | try:
76 | self.major = int(version[0])
77 | self.medium = int(version[1])
78 | except (ValueError, IndexError), e:
79 | self.major = self.medium = 0
80 |
81 | def __str__(self):
82 | return "DB(%r, %r, version=%r)" % (self.sockfile, self.dbname,
83 | self.version)
84 |
85 | def __repr__(self):
86 | return self.__str__()
87 |
88 | def isShowGlobalStatusSafe(self):
89 | """Returns whether or not SHOW GLOBAL STATUS is safe to run."""
90 | # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it
91 | # locks the entire database for too long and severely impacts traffic.
92 | return self.major > 5 or (self.major == 5 and self.medium >= 1)
93 |
94 | def query(self, sql):
95 | """Executes the given SQL statement and returns a sequence of rows."""
96 | assert self.cursor, "%s already closed?" % (self,)
97 | try:
98 | self.cursor.execute(sql)
99 | except MySQLdb.OperationalError, (errcode, msg):
100 | if errcode != 2006: # "MySQL server has gone away"
101 | raise
102 | self._reconnect()
103 | return self.cursor.fetchall()
104 |
105 | def close(self):
106 | """Closes the connection to this MySQL server."""
107 | if self.cursor:
108 | self.cursor.close()
109 | self.cursor = None
110 | if self.db:
111 | self.db.close()
112 | self.db = None
113 |
114 | def _reconnect(self):
115 | """Reconnects to this MySQL server."""
116 | self.close()
117 | self.db = mysql_connect(self.sockfile)
118 | self.cursor = self.db.cursor()
119 |
120 |
def mysql_connect(sockfile):
    """Connects to the MySQL server using the specified socket file."""
    # Credentials come from the site-local mysqlconf module.
    user, passwd = mysqlconf.get_user_password(sockfile)
    return MySQLdb.connect(connect_timeout=CONNECT_TIMEOUT,
                           unix_socket=sockfile,
                           user=user,
                           passwd=passwd)
127 |
128 |
def todict(db, row):
    """Transforms a row (returned by DB.query) into a dict keyed by column names.

    Args:
      db: The DB instance from which this row was obtained.
      row: A row as returned by DB.query
    """
    # Lower-case the column names to normalize them, pairing each with the
    # value at the same index in the row.
    return dict((field[0].lower(), row[index])
                for index, field in enumerate(db.cursor.description))
141 |
def get_dbname(sockfile):
    """Returns the name of the DB based on the path to the socket file."""
    if sockfile in DEFAULT_SOCKFILES:
        return "default"
    # Socket files live in /...foo/mysql-<dbname>/<something>.sock
    match = re.search(r"/mysql-(.+)/[^.]+\.sock$", sockfile)
    if match:
        return match.group(1)
    err("error: couldn't guess the name of the DB for " + sockfile)
    return None
151 |
152 |
153 | def is_sockfile(path):
154 | """Returns whether or not the given path is a socket file."""
155 | try:
156 | s = os.stat(path)
157 | except OSError, (no, e):
158 | if no == errno.ENOENT:
159 | return False
160 | err("warning: couldn't stat(%r): %s" % (path, e))
161 | return None
162 | return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK
163 |
164 |
def find_sockfiles():
    """Returns a list of paths to socket files to monitor."""
    paths = []
    # Look for socket files in <search dir>/<db dir>/<socket file>.
    for parent in SEARCH_DIRS:  # renamed from `dir' (shadowed the builtin)
        if not os.path.isdir(parent):
            continue
        for name in os.listdir(parent):
            subdir = os.path.join(parent, name)
            if not os.path.isdir(subdir):
                continue
            for subname in os.listdir(subdir):
                path = os.path.join(subdir, subname)
                if is_sockfile(path):
                    paths.append(path)
                    # Bug fix: only bail out once we've actually found the
                    # socket.  The break used to be outside this `if', so
                    # only the first directory entry (os.listdir order is
                    # arbitrary) was ever examined and sockets were missed.
                    break  # We only expect 1 socket file per DB, so get out.
    # Try the default locations.
    for sockfile in DEFAULT_SOCKFILES:
        if not is_sockfile(sockfile):
            continue
        paths.append(sockfile)
    return paths
187 |
188 |
189 | def find_databases(dbs=None):
190 | """Returns a map of dbname (string) to DB instances to monitor.
191 |
192 | Args:
193 | dbs: A map of dbname (string) to DB instances already monitored.
194 | This map will be modified in place if it's not None.
195 | """
196 | sockfiles = find_sockfiles()
197 | if dbs is None:
198 | dbs = {}
199 | for sockfile in sockfiles:
200 | dbname = get_dbname(sockfile)
201 | if dbname in dbs:
202 | continue
203 | if not dbname:
204 | continue
205 | try:
206 | db = mysql_connect(sockfile)
207 | cursor = db.cursor()
208 | cursor.execute("SELECT VERSION()")
209 | except (EnvironmentError, EOFError, RuntimeError, socket.error,
210 | MySQLdb.MySQLError), e:
211 | err("Couldn't connect to %s: %s" % (sockfile, e))
212 | continue
213 | version = cursor.fetchone()[0]
214 | dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
215 | return dbs
216 |
217 |
def now():
    """Returns the current wall-clock time as an integer UNIX timestamp."""
    current = time.time()
    return int(current)
220 |
221 |
def isyes(s):
    """Returns 1 if the string is "yes" (case-insensitive), 0 otherwise."""
    return int(s.lower() == "yes")
226 |
227 |
def collectInnodbStatus(db):
    """Collects and prints InnoDB stats about the given DB instance.

    Parses the free-form text of `SHOW ENGINE INNODB STATUS' with an ordered
    cascade of regexps; each recognized line is emitted as one or more
    mysql.innodb.* metrics.
    """
    ts = now()
    # NOTE: printmetric closes over `ts', which may be replaced just below
    # by InnoDB's own monitor timestamp.
    def printmetric(metric, value, tags=""):
        print "mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)

    innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2]
    m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$",
                  innodb_status, re.M)
    if m:  # If we have it, try to use InnoDB's own timestamp.
        ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S")))

    # `match' closes over `line', which is rebound by the for loop below,
    # so each call matches the regexp against the current status line.
    line = None
    def match(regexp):
        return re.match(regexp, line)

    for line in innodb_status.split("\n"):
        # SEMAPHORES
        m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)")
        if m:
            printmetric("innodb.oswait_array.reservation_count", m.group(1))
            printmetric("innodb.oswait_array.signal_count", m.group(2))
            continue
        m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex")
            printmetric("innodb.locks.rounds", m.group(2), " type=mutex")
            printmetric("innodb.locks.os_waits", m.group(3), " type=mutex")
            continue
        m = match("RW-shared spins (\d+), OS waits (\d+);"
                  " RW-excl spins (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared")
            printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared")
            printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive")
            printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive")
            continue
        # INSERT BUFFER AND ADAPTIVE HASH INDEX
        # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and
        # the following one can appear multiple times. I've never seen this.
        # If that happens, we need to aggregate the values here instead of
        # printing them directly.
        m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),")
        if m:
            printmetric("innodb.ibuf.size", m.group(1))
            printmetric("innodb.ibuf.free_list_len", m.group(2))
            printmetric("innodb.ibuf.seg_size", m.group(3))
            continue
        m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges")
        if m:
            printmetric("innodb.ibuf.inserts", m.group(1))
            printmetric("innodb.ibuf.merged_recs", m.group(2))
            printmetric("innodb.ibuf.merges", m.group(3))
            continue
        # ROW OPERATIONS
        m = match("\d+ queries inside InnoDB, (\d+) queries in queue")
        if m:
            printmetric("innodb.queries_queued", m.group(1))
            continue
        m = match("(\d+) read views open inside InnoDB")
        if m:
            printmetric("innodb.opened_read_views", m.group(1))
            continue
        # TRANSACTION
        m = match("History list length (\d+)")
        if m:
            printmetric("innodb.history_list_length", m.group(1))
            continue
296 |
297 |
298 | def collect(db):
299 | """Collects and prints stats about the given DB instance."""
300 |
301 | ts = now()
302 | def printmetric(metric, value, tags=""):
303 | print "mysql.%s %d %s schema=%s%s" % (metric, ts, value, db.dbname, tags)
304 |
305 | has_innodb = False
306 | if db.isShowGlobalStatusSafe():
307 | for metric, value in db.query("SHOW GLOBAL STATUS"):
308 | try:
309 | if "." in value:
310 | value = float(value)
311 | else:
312 | value = int(value)
313 | except ValueError:
314 | continue
315 | metric = metric.lower()
316 | has_innodb = has_innodb or metric.startswith("innodb")
317 | printmetric(metric, value)
318 |
319 | if has_innodb:
320 | collectInnodbStatus(db)
321 |
322 | if has_innodb and False: # Disabled because it's too expensive for InnoDB.
323 | waits = {} # maps a mutex name to the number of waits
324 | ts = now()
325 | for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"):
326 | if not status.startswith("os_waits"):
327 | continue
328 | m = re.search("&(\w+)(?:->(\w+))?$", mutex)
329 | if not m:
330 | continue
331 | mutex, kind = m.groups()
332 | if kind:
333 | mutex += "." + kind
334 | wait_count = int(status.split("=", 1)[1])
335 | waits[mutex] = waits.get(mutex, 0) + wait_count
336 | for mutex, wait_count in waits.iteritems():
337 | printmetric("innodb.locks", wait_count, " mutex=" + mutex)
338 |
339 | ts = now()
340 |
341 | mysql_slave_status = db.query("SHOW SLAVE STATUS")
342 | if mysql_slave_status:
343 | slave_status = todict(db, mysql_slave_status[0])
344 | master_host = slave_status["master_host"]
345 | else:
346 | master_host = None
347 |
348 | if master_host and master_host != "None":
349 | sbm = slave_status.get("seconds_behind_master")
350 | if isinstance(sbm, (int, long)):
351 | printmetric("slave.seconds_behind_master", sbm)
352 | printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"])
353 | printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"])
354 | printmetric("slave.thread_io_running",
355 | isyes(slave_status["slave_io_running"]))
356 | printmetric("slave.thread_sql_running",
357 | isyes(slave_status["slave_sql_running"]))
358 |
359 | states = {} # maps a connection state to number of connections in that state
360 | for row in db.query("SHOW PROCESSLIST"):
361 | id, user, host, db_, cmd, time, state = row[:7]
362 | states[cmd] = states.get(cmd, 0) + 1
363 | for state, count in states.iteritems():
364 | state = state.lower().replace(" ", "_")
365 | printmetric("connection_states", count, " state=%s" % state)
366 |
367 |
368 | def main(args):
369 | """Collects and dumps stats from a MySQL server."""
370 | if not find_sockfiles(): # Nothing to monitor.
371 | return 13 # Ask tcollector to not respawn us.
372 | if MySQLdb is None:
373 | err("error: Python module `MySQLdb' is missing")
374 | return 1
375 |
376 | last_db_refresh = now()
377 | dbs = find_databases()
378 | while True:
379 | ts = now()
380 | if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
381 | find_databases(dbs)
382 | last_db_refresh = ts
383 |
384 | errs = []
385 | for dbname, db in dbs.iteritems():
386 | try:
387 | collect(db)
388 | except (EnvironmentError, EOFError, RuntimeError, socket.error,
389 | MySQLdb.MySQLError), e:
390 | if isinstance(e, IOError) and e[0] == errno.EPIPE:
391 | # Exit on a broken pipe. There's no point in continuing
392 | # because no one will read our stdout anyway.
393 | return 2
394 | err("error: failed to collect data from %s: %s" % (db, e))
395 | errs.append(dbname)
396 |
397 | for dbname in errs:
398 | del dbs[dbname]
399 |
400 | sys.stdout.flush()
401 | time.sleep(COLLECTION_INTERVAL)
402 |
403 |
if __name__ == "__main__":
    # We never read stdin, so close it.
    sys.stdin.close()
    sys.exit(main(sys.argv))
407 |
--------------------------------------------------------------------------------
/collectors/0/netstat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2011 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 |
15 | # Note: I spent many hours reading the Linux kernel's source code to infer the
16 | # exact meaning of some of the obscure but useful metrics it exposes. The
17 | # description of the metrics are correct to the best of my knowledge, but it's
18 | # not always easy to make sense of the Linux kernel's code.  Please report any
19 | # inaccuracy you find. -- tsuna.
20 | """Socket allocation and network statistics for TSDB.
21 |
22 | Metrics from /proc/net/sockstat:
23 | - net.sockstat.num_sockets: Number of sockets allocated (only TCP).
24 | - net.sockstat.num_timewait: Number of TCP sockets currently in
25 | TIME_WAIT state.
26 | - net.sockstat.sockets_inuse: Number of sockets in use (TCP/UDP/raw).
27 | - net.sockstat.num_orphans: Number of orphan TCP sockets (not attached
28 | to any file descriptor).
29 | - net.sockstat.memory: Memory allocated for this socket type (in bytes).
30 | - net.sockstat.ipfragqueues: Number of IP flows for which there are
31 | currently fragments queued for reassembly.
32 |
33 | Metrics from /proc/net/netstat (`netstat -s' command):
34 | - net.stat.tcp.abort: Number of connections that the kernel had to abort.
35 | type=memory is especially bad, the kernel had to drop a connection due to
36 | having too many orphaned sockets. Other types are normal (e.g. timeout).
37 | - net.stat.tcp.abort.failed: Number of times the kernel failed to abort a
38 | connection because it didn't even have enough memory to reset it (bad).
39 | - net.stat.tcp.congestion.recovery: Number of times the kernel detected
40 | spurious retransmits and was able to recover part or all of the CWND.
41 | - net.stat.tcp.delayedack: Number of delayed ACKs sent of different types.
42 | - net.stat.tcp.failed_accept: Number of times a connection had to be dropped
43 | after the 3WHS. reason=full_acceptq indicates that the application isn't
44 | accepting connections fast enough. You should see SYN cookies too.
45 | - net.stat.tcp.invalid_sack: Number of invalid SACKs we saw of diff types.
46 | (requires Linux v2.6.24-rc1 or newer)
47 | - net.stat.tcp.memory.pressure: Number of times a socket entered the
48 | "memory pressure" mode (not great).
49 | - net.stat.tcp.memory.prune: Number of times a socket had to discard
50 | received data due to low memory conditions (bad).
51 | - net.stat.tcp.packetloss.recovery: Number of times we recovered from packet
52 | loss by type of recovery (e.g. fast retransmit vs SACK).
53 | - net.stat.tcp.receive.queue.full: Number of times a received packet had to
54 | be dropped because the socket's receive queue was full.
55 | (requires Linux v2.6.34-rc2 or newer)
56 | - net.stat.tcp.reording: Number of times we detected re-ordering and how.
57 | - net.stat.tcp.syncookies: SYN cookies (both sent & received).
58 | """
59 |
60 | import os
61 | import pwd
62 | import re
63 | import resource
64 | import sys
65 | import time
66 |
# If we're running as root and this user exists, we'll drop privileges.
# (See drop_privileges() below.)
USER = "nobody"
69 |
70 |
def drop_privileges():
    """Drops privileges if running as root."""
    try:
        entry = pwd.getpwnam(USER)
    except KeyError:
        return  # The target user doesn't exist; nothing we can do.

    if os.getuid() == 0:
        # Only root can (and needs to) switch to an unprivileged account.
        os.setgid(entry.pw_gid)
        os.setuid(entry.pw_uid)
83 |
84 |
85 |
86 | def main():
87 | """Main loop"""
88 | drop_privileges()
89 | sys.stdin.close()
90 |
91 | interval = 15
92 | page_size = resource.getpagesize()
93 |
94 | try:
95 | sockstat = open("/proc/net/sockstat")
96 | netstat = open("/proc/net/netstat")
97 | except IOError, e:
98 | print >>sys.stderr, "Failed to open /proc/net/sockstat: %s" % e
99 | return 13 # Ask tcollector to not re-start us.
100 |
101 | # Note: up until v2.6.37-rc2 most of the values were 32 bits.
102 | # The first value is pretty useless since it accounts for some
103 | # socket types but not others. So we don't report it because it's
104 | # more confusing than anything else and it's not well documented
105 | # what type of sockets are or aren't included in this count.
106 | regexp = re.compile("sockets: used \d+\n"
107 | "TCP: inuse (?P\d+) orphan (?P\d+)"
108 | " tw (?P\d+) alloc (?P\d+)"
109 | " mem (?P\d+)\n"
110 | "UDP: inuse (?P\d+)"
111 | # UDP memory accounting was added in v2.6.25-rc1
112 | "(?: mem (?P\d+))?\n"
113 | # UDP-Lite (RFC 3828) was added in v2.6.20-rc2
114 | "(?:UDPLITE: inuse (?P\d+)\n)?"
115 | "RAW: inuse (?P\d+)\n"
116 | "FRAG: inuse (?P\d+)"
117 | " memory (?P\d+)\n")
118 |
119 | def print_sockstat(metric, value, tags=""): # Note: tags must start with ' '
120 | if value is not None:
121 | print "net.sockstat.%s %d %s%s" % (metric, ts, value, tags)
122 |
123 |
124 | # If a line in /proc/net/netstat doesn't start with a word in that dict,
125 | # we'll ignore it. We use the value to build the metric name.
126 | known_netstatstypes = {
127 | "TcpExt:": "tcp",
128 | "IpExt:": "ip", # We don't collect anything from here for now.
129 | }
130 |
131 | # Any stat in /proc/net/netstat that doesn't appear in this dict will be
132 | # ignored. If we find a match, we'll use the (metricname, tags).
133 | known_netstats = {
134 | # An application wasn't able to accept a connection fast enough, so
135 | # the kernel couldn't store an entry in the queue for this connection.
136 | # Instead of dropping it, it sent a cookie to the client.
137 | "SyncookiesSent": ("syncookies", "type=sent"),
138 | # After sending a cookie, it came back to us and passed the check.
139 | "SyncookiesRecv": ("syncookies", "type=received"),
140 | # After sending a cookie, it came back to us but looked invalid.
141 | "SyncookiesFailed": ("syncookies", "type=failed"),
142 | # When a socket is using too much memory (rmem), the kernel will first
143 | # discard any out-of-order packet that has been queued (with SACK).
144 | "OfoPruned": ("memory.prune", "type=drop_ofo_queue"),
145 | # If the kernel is really really desperate and cannot give more memory
146 | # to this socket even after dropping the ofo queue, it will simply
147 | # discard the packet it received. This is Really Bad.
148 | "RcvPruned": ("memory.prune", "type=drop_received"),
149 | # We waited for another packet to send an ACK, but didn't see any, so
150 | # a timer ended up sending a delayed ACK.
151 | "DelayedACKs": ("delayedack", "type=sent"),
152 | # We wanted to send a delayed ACK but failed because the socket was
153 | # locked. So the timer was reset.
154 | "DelayedACKLocked": ("delayedack", "type=locked"),
155 | # We sent a delayed and duplicated ACK because the remote peer
156 | # retransmitted a packet, thinking that it didn't get to us.
157 | "DelayedACKLost": ("delayedack", "type=lost"),
158 | # We completed a 3WHS but couldn't put the socket on the accept queue,
159 | # so we had to discard the connection.
160 | "ListenOverflows": ("failed_accept", "reason=full_acceptq"),
161 | # We couldn't accept a connection because one of: we had no route to
162 | # the destination, we failed to allocate a socket, we failed to
163 | # allocate a new local port bind bucket. Note: this counter
164 | # also include all the increments made to ListenOverflows...
165 | "ListenDrops": ("failed_accept", "reason=other"),
166 | # A packet was lost and we recovered after a fast retransmit.
167 | "TCPRenoRecovery": ("packetloss.recovery", "type=fast_retransmit"),
168 | # A packet was lost and we recovered by using selective
169 | # acknowledgements.
170 | "TCPSackRecovery": ("packetloss.recovery", "type=sack"),
171 | # We detected re-ordering using FACK (Forward ACK -- the highest
172 | # sequence number known to have been received by the peer when using
173 | # SACK -- FACK is used during congestion control).
174 | "TCPFACKReorder": ("reording", "detectedby=fack"),
175 | # We detected re-ordering using SACK.
176 | "TCPSACKReorder": ("reording", "detectedby=sack"),
177 | # We detected re-ordering using fast retransmit.
178 | "TCPRenoReorder": ("reording", "detectedby=fast_retransmit"),
179 | # We detected re-ordering using the timestamp option.
180 | "TCPTSReorder": ("reording", "detectedby=timestamp"),
181 | # We detected some erroneous retransmits and undid our CWND reduction.
182 | "TCPFullUndo": ("congestion.recovery", "type=full_undo"),
183 | # We detected some erroneous retransmits, a partial ACK arrived while
184 | # we were fast retransmitting, so we were able to partially undo some
185 | # of our CWND reduction.
186 | "TCPPartialUndo": ("congestion.recovery", "type=hoe_heuristic"),
187 | # We detected some erroneous retransmits, a D-SACK arrived and ACK'ed
188 | # all the retransmitted data, so we undid our CWND reduction.
189 | "TCPDSACKUndo": ("congestion.recovery", "type=sack"),
190 | # We detected some erroneous retransmits, a partial ACK arrived, so we
191 | # undid our CWND reduction.
192 | "TCPLossUndo": ("congestion.recovery", "type=ack"),
193 | # We received an unexpected SYN so we sent a RST to the peer.
194 | "TCPAbortOnSyn": ("abort", "type=unexpected_syn"),
195 | # We were in FIN_WAIT1 yet we received a data packet with a sequence
196 | # number that's beyond the last one for this connection, so we RST'ed.
197 | "TCPAbortOnData": ("abort", "type=data_after_fin_wait1"),
198 | # We received data but the user has closed the socket, so we have no
199 | # wait of handing it to them, so we RST'ed.
200 | "TCPAbortOnClose": ("abort", "type=data_after_close"),
201 | # This is Really Bad. It happens when there are too many orphaned
202 | # sockets (not attached a FD) and the kernel has to drop a connection.
203 | # Sometimes it will send a reset to the peer, sometimes it wont.
204 | "TCPAbortOnMemory": ("abort", "type=out_of_memory"),
205 | # The connection timed out really hard.
206 | "TCPAbortOnTimeout": ("abort", "type=timeout"),
207 | # We killed a socket that was closed by the application and lingered
208 | # around for long enough.
209 | "TCPAbortOnLinger": ("abort", "type=linger"),
210 | # We tried to send a reset, probably during one of teh TCPABort*
211 | # situations above, but we failed e.g. because we couldn't allocate
212 | # enough memory (very bad).
213 | "TCPAbortFailed": ("abort.failed", None),
214 | # Number of times a socket was put in "memory pressure" due to a non
215 | # fatal memory allocation failure (reduces the send buffer size etc).
216 | "TCPMemoryPressures": ("memory.pressure", None),
217 | # We got a completely invalid SACK block and discarded it.
218 | "TCPSACKDiscard": ("invalid_sack", "type=invalid"),
219 | # We got a duplicate SACK while retransmitting so we discarded it.
220 | "TCPDSACKIgnoredOld": ("invalid_sack", "type=retransmit"),
221 | # We got a duplicate SACK and discarded it.
222 | "TCPDSACKIgnoredNoUndo": ("invalid_sack", "type=olddup"),
223 | # We received something but had to drop it because the socket's
224 | # receive queue was full.
225 | "TCPBacklogDrop": ("receive.queue.full", None),
226 | }
227 |
228 |
229 | def print_netstat(statstype, metric, value, tags=""):
230 | if tags:
231 | space = " "
232 | else:
233 | tags = space = ""
234 | print "net.stat.%s.%s %d %s%s%s" % (statstype, metric, ts, value,
235 | space, tags)
236 |
237 | statsdikt = {}
238 | while True:
239 | ts = int(time.time())
240 | sockstat.seek(0)
241 | netstat.seek(0)
242 | data = sockstat.read()
243 | stats = netstat.read()
244 | m = re.match(regexp, data)
245 | if not m:
246 | print >>sys.stderr, "Cannot parse sockstat: %r" % data
247 | return 13
248 |
249 | # The difference between the first two values is the number of
250 | # sockets allocated vs the number of sockets actually in use.
251 | print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp")
252 | print_sockstat("num_timewait", m.group("tw_count"))
253 | print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp")
254 | print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp")
255 | print_sockstat("sockets_inuse", m.group("udplite_inuse"), " type=udplite")
256 | print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw")
257 |
258 | print_sockstat("num_orphans", m.group("orphans"))
259 | print_sockstat("memory", int(m.group("tcp_pages")) * page_size,
260 | " type=tcp")
261 | if m.group("udp_pages") is not None:
262 | print_sockstat("memory", int(m.group("udp_pages")) * page_size,
263 | " type=udp")
264 | print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag")
265 | print_sockstat("ipfragqueues", m.group("ip_frag_nqueues"))
266 |
267 | # /proc/net/netstat has a retarded column-oriented format. It looks
268 | # like this:
269 | # Header: SomeMetric OtherMetric
270 | # Header: 1 2
271 | # OtherHeader: ThirdMetric FooBar
272 | # OtherHeader: 42 51
273 | # We first group all the lines for each header together:
274 | # {"Header:": [["SomeMetric", "OtherHeader"], ["1", "2"]],
275 | # "OtherHeader:": [["ThirdMetric", "FooBar"], ["42", "51"]]}
276 | # Then we'll create a dict for each type:
277 | # {"SomeMetric": "1", "OtherHeader": "2"}
278 | for line in stats.splitlines():
279 | line = line.split()
280 | if line[0] not in known_netstatstypes:
281 | print >>sys.stderr, ("Unrecoginized line in /proc/net/netstat:"
282 | " %r (file=%r)" % (line, stats))
283 | continue
284 | statstype = line.pop(0)
285 | statsdikt.setdefault(known_netstatstypes[statstype], []).append(line)
286 | for statstype, stats in statsdikt.iteritems():
287 | # stats is now:
288 | # [["SyncookiesSent", "SyncookiesRecv", ...], ["1", "2", ....]]
289 | assert len(stats) == 2, repr(statsdikt)
290 | stats = dict(zip(*stats))
291 | value = stats.get("ListenDrops")
292 | if value is not None: # Undo the kernel's double counting
293 | stats["ListenDrops"] = int(value) - int(stats.get("ListenOverflows", 0))
294 | for stat, (metric, tags) in known_netstats.iteritems():
295 | value = stats.get(stat)
296 | if value is not None:
297 | print_netstat(statstype, metric, value, tags)
298 | stats.clear()
299 | statsdikt.clear()
300 |
301 | sys.stdout.flush()
302 | time.sleep(interval)
303 |
if __name__ == "__main__":
    # main() returns 13 when it wants tcollector to stop respawning us.
    sys.exit(main())
306 |
--------------------------------------------------------------------------------
/collectors/0/procnettcp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 |
15 | """TCP socket state data for TSDB"""
16 | #
17 | # Read /proc/net/tcp, which gives netstat -a type
18 | # data for all TCP sockets.
19 |
20 | # Note this collector generates a lot of lines, given that there are
21 | # lots of tcp states and given the number of subcollections we do.
22 | # We rely heavily on tcollector's deduping. We could be lazy and
23 | # just output values only for which we have data, except if we do
24 | # this then any counters for which we had data would never reach
25 | # zero since our state machine never enters this condition.
26 |
27 | # Metric: proc.net.tcp
28 |
29 | # For each run, we classify each connection and generate subtotals.
30 | # TSD will automatically total these up when displaying
31 | # the graph, but you can drill down for each possible total or a
32 | # particular one. This does generate a large amount of datapoints,
33 | # as the number of points is (S*(U+1)*V) (currently ~400), where
34 | # S=number of TCP states, U=Number of users to track, and
35 | # V=number of services (collections of ports)
36 | # The deduper does dedup this down very well, as only 3 of the 10
37 | # TCP states are generally ever seen, and most servers only run one
38 | # service under one user. On a typical server this dedups down to
39 | # under 10 values per interval.
40 |
41 | # Each connection is broken down with a tag for user=username (see
42 | # "users" list below) or put under "other" if not in the list.
43 | # Expand this for any users you care about.
44 | # It is also broken down for each state (state=).
45 | # It is also broken down into services (collections of ports)
46 |
47 | # Note that once a connection is closed, Linux seems to forget who
48 | # opened/handled the connection. For connections in time_wait, for
49 | # example, they will always show user=root.
50 |
51 | import os
52 | import sys
53 | import time
54 | import socket
55 | import pwd
56 |
57 |
# Users whose connections get their own user= tag; anything else is "other".
USERS = ("root", "www-data", "mysql")

# Note if a service runs on multiple ports and you
# want to collectively map them up to a single service,
# just give them the same name below

# Maps a local or remote port number to a service name for the service= tag.
PORTS = {
    80: "http",
    443: "https",
    3001: "http-varnish",
    3002: "http-varnish",
    3003: "http-varnish",
    3004: "http-varnish",
    3005: "http-varnish",
    3006: "http-varnish",
    3007: "http-varnish",
    3008: "http-varnish",
    3009: "http-varnish",
    3010: "http-varnish",
    3011: "http-varnish",
    3012: "http-varnish",
    3013: "http-varnish",
    3014: "http-varnish",
    3306: "mysql",
    3564: "mysql",
    9000: "namenode",
    9090: "thriftserver",
    11211: "memcache",
    11212: "memcache",
    11213: "memcache",
    11214: "memcache",
    11215: "memcache",
    11216: "memcache",
    11217: "memcache",
    11218: "memcache",
    11219: "memcache",
    11220: "memcache",
    11221: "memcache",
    11222: "memcache",
    11223: "memcache",
    11224: "memcache",
    11225: "memcache",
    11226: "memcache",
    50020: "datanode",
    60020: "hregionserver",
}

# The distinct service names, used to enumerate every possible tagset.
SERVICES = tuple(set(PORTS.itervalues()))

# Maps the hex `st' column of /proc/net/tcp to a readable TCP state name.
TCPSTATES = {
    "01": "established",
    "02": "syn_sent",
    "03": "syn_recv",
    "04": "fin_wait1",
    "05": "fin_wait2",
    "06": "time_wait",
    "07": "close",
    "08": "close_wait",
    "09": "last_ack",
    "0A": "listen",
    "0B": "closing",
}

# If we're running as root and this user exists, we'll drop privileges.
USER = "nobody"
123 |
124 |
def drop_privileges():
    """Switches to the unprivileged USER account when running as root."""
    try:
        entry = pwd.getpwnam(USER)
    except KeyError:
        return  # USER doesn't exist on this system; keep current identity.

    if os.getuid() != 0:
        return  # Not root: no privileges to drop.

    os.setgid(entry.pw_gid)
    os.setuid(entry.pw_uid)
136 |
137 |
def is_public_ip(ipstr):
    """
    Take a /proc/net/tcp encoded src or dest string
    Return True if it is coming from public IP space
    (i.e. is not RFC1918, loopback, or 0.0.0.0/8).
    This string is the hex ip:port of the connection.
    (ip is reversed)
    """
    addr = ipstr.split(":")[0]
    addr = int(addr, 16)
    # The kernel stores the address little-endian, so the low byte of the
    # integer is the first octet of the dotted-quad notation.
    byte1 = addr & 0xFF
    byte2 = (addr >> 8) & 0xFF
    if byte1 in (10, 0, 127):
        return False  # 10/8 (RFC1918), 0/8 ("this" network), 127/8 (loopback)
    # Fixed: RFC1918 defines 172.16/12, i.e. second octet 16-31 inclusive.
    # The old check (byte2 > 16) missed 172.16.x.x and wrongly excluded
    # 172.32.0.0 through 172.255.255.255.
    if byte1 == 172 and 16 <= byte2 <= 31:
        return False  # 172.16/12 (RFC1918)
    if byte1 == 192 and byte2 == 168:
        return False  # 192.168/16 (RFC1918)
    return True
157 |
158 |
159 | def main(unused_args):
160 | """procnettcp main loop"""
161 | drop_privileges()
162 | try: # On some Linux kernel versions, with lots of connections
163 | os.nice(19) # this collector can be very CPU intensive. So be nicer.
164 | except OSError, e:
165 | print >>sys.stderr, "warning: failed to self-renice:", e
166 |
167 | interval = 60
168 |
169 | # resolve the list of users to match on into UIDs
170 | uids = {}
171 | for user in USERS:
172 | try:
173 | uids[str(pwd.getpwnam(user)[2])] = user
174 | except KeyError:
175 | continue
176 |
177 | try:
178 | tcp = open("/proc/net/tcp")
179 | # if IPv6 is enabled, even IPv4 connections will also
180 | # appear in tcp6. It has the same format, apart from the
181 | # address size
182 | try:
183 | tcp6 = open("/proc/net/tcp6")
184 | except IOError, (errno, msg):
185 | if errno == 2: # No such file => IPv6 is disabled.
186 | tcp6 = None
187 | else:
188 | raise
189 | except IOError, e:
190 | print >>sys.stderr, "Failed to open input file: %s" % (e,)
191 | return 13 # Ask tcollector to not re-start us immediately.
192 |
193 | while True:
194 | counter = {}
195 |
196 | for procfile in (tcp, tcp6):
197 | if procfile is None:
198 | continue
199 | procfile.seek(0)
200 | ts = int(time.time())
201 | for line in procfile:
202 | try:
203 | # pylint: disable=W0612
204 | (num, src, dst, state, queue, when, retrans,
205 | uid, timeout, inode) = line.split(None, 9)
206 | except ValueError: # Malformed line
207 | continue
208 |
209 | if num == "sl": # header
210 | continue
211 |
212 | srcport = src.split(":")[1]
213 | dstport = dst.split(":")[1]
214 | srcport = int(srcport, 16)
215 | dstport = int(dstport, 16)
216 | service = PORTS.get(srcport, "other")
217 | service = PORTS.get(dstport, service)
218 |
219 | if is_public_ip(dst) or is_public_ip(src):
220 | endpoint = "external"
221 | else:
222 | endpoint = "internal"
223 |
224 |
225 | user = uids.get(uid, "other")
226 |
227 | key = "state=" + TCPSTATES[state] + " endpoint=" + endpoint + \
228 | " service=" + service + " user=" + user
229 | if key in counter:
230 | counter[key] += 1
231 | else:
232 | counter[key] = 1
233 |
234 | # output the counters
235 | for state in TCPSTATES:
236 | for service in SERVICES + ("other",):
237 | for user in USERS + ("other",):
238 | for endpoint in ("internal", "external"):
239 | key = ("state=%s endpoint=%s service=%s user=%s"
240 | % (TCPSTATES[state], endpoint, service, user))
241 | if key in counter:
242 | print "proc.net.tcp", ts, counter[key], key
243 | else:
244 | print "proc.net.tcp", ts, "0", key
245 |
246 | sys.stdout.flush()
247 | time.sleep(interval)
248 |
if __name__ == "__main__":
    # main() returns 13 when it wants tcollector to stop respawning us.
    sys.exit(main(sys.argv))
251 |
--------------------------------------------------------------------------------
/collectors/0/procstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 | #
15 | """import various /proc stats from /proc into TSDB"""
16 |
17 | import os
18 | import sys
19 | import time
20 | import socket
21 | import re
22 |
23 | COLLECTION_INTERVAL = 15 # seconds
24 | NUMADIR = "/sys/devices/system/node"
25 |
26 |
27 | def open_sysfs_numa_stats():
28 | """Returns a possibly empty list of opened files."""
29 | try:
30 | nodes = os.listdir(NUMADIR)
31 | except OSError, (errno, msg):
32 | if errno == 2: # No such file or directory
33 | return [] # We don't have NUMA stats.
34 | raise
35 |
36 | nodes = [node for node in nodes if node.startswith("node")]
37 | numastats = []
38 | for node in nodes:
39 | try:
40 | numastats.append(open(os.path.join(NUMADIR, node, "numastat")))
41 | except OSError, (errno, msg):
42 | if errno == 2: # No such file or directory
43 | continue
44 | raise
45 | return numastats
46 |
47 |
48 | def print_numa_stats(numafiles):
49 | """From a list of opened files, extracts and prints NUMA stats."""
50 | for numafile in numafiles:
51 | numafile.seek(0)
52 | node_id = int(numafile.name[numafile.name.find("/node/node")+10:-9])
53 | ts = int(time.time())
54 | stats = dict(line.split() for line in numafile.read().splitlines())
55 | for stat, tag in (# hit: process wanted memory from this node and got it
56 | ("numa_hit", "hit"),
57 | # miss: process wanted another node and got it from
58 | # this one instead.
59 | ("numa_miss", "miss")):
60 | print ("sys.numa.zoneallocs %d %s node=%d type=%s"
61 | % (ts, stats[stat], node_id, tag))
62 | # Count this one as a separate metric because we can't sum up hit +
63 | # miss + foreign, this would result in double-counting of all misses.
64 | # See `zone_statistics' in the code of the kernel.
65 | # foreign: process wanted memory from this node but got it from
66 | # another node. So maybe this node is out of free pages.
67 | print ("sys.numa.foreign_allocs %d %s node=%d"
68 | % (ts, stats["numa_foreign"], node_id))
69 | # When is memory allocated to a node that's local or remote to where
70 | # the process is running.
71 | for stat, tag in (("local_node", "local"),
72 | ("other_node", "remote")):
73 | print ("sys.numa.allocation %d %s node=%d type=%s"
74 | % (ts, stats[stat], node_id, tag))
75 | # Pages successfully allocated with the interleave policy.
76 | print ("sys.numa.interleave %d %s node=%d type=hit"
77 | % (ts, stats["interleave_hit"], node_id))
78 |
79 |
80 | def main():
81 | """procstats main loop"""
82 |
83 | f_uptime = open("/proc/uptime", "r")
84 | f_meminfo = open("/proc/meminfo", "r")
85 | f_vmstat = open("/proc/vmstat", "r")
86 | f_stat = open("/proc/stat", "r")
87 | f_loadavg = open("/proc/loadavg", "r")
88 | f_entropy_avail = open("/proc/sys/kernel/random/entropy_avail", "r")
89 | numastats = open_sysfs_numa_stats()
90 |
91 | while True:
92 | # proc.uptime
93 | f_uptime.seek(0)
94 | ts = int(time.time())
95 | for line in f_uptime:
96 | m = re.match("(\S+)\s+(\S+)", line)
97 | if m:
98 | print "proc.uptime.total %d %s" % (ts, m.group(1))
99 | print "proc.uptime.now %d %s" % (ts, m.group(2))
100 |
101 | # proc.meminfo
102 | f_meminfo.seek(0)
103 | ts = int(time.time())
104 | for line in f_meminfo:
105 | m = re.match("(\w+):\s+(\d+)", line)
106 | if m:
107 | print ("proc.meminfo.%s %d %s"
108 | % (m.group(1).lower(), ts, m.group(2)))
109 |
110 | # proc.vmstat
111 | f_vmstat.seek(0)
112 | ts = int(time.time())
113 | for line in f_vmstat:
114 | m = re.match("(\w+)\s+(\d+)", line)
115 | if not m:
116 | continue
117 | if m.group(1) in ("pgpgin", "pgpgout", "pswpin",
118 | "pswpout", "pgfault", "pgmajfault"):
119 | print "proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2))
120 |
121 | # proc.stat
122 | f_stat.seek(0)
123 | ts = int(time.time())
124 | for line in f_stat:
125 | m = re.match("(\w+)\s+(.*)", line)
126 | if not m:
127 | continue
128 | if m.group(1) == "cpu":
129 | fields = m.group(2).split()
130 | print "proc.stat.cpu %d %s type=user" % (ts, fields[0])
131 | print "proc.stat.cpu %d %s type=nice" % (ts, fields[1])
132 | print "proc.stat.cpu %d %s type=system" % (ts, fields[2])
133 | print "proc.stat.cpu %d %s type=idle" % (ts, fields[3])
134 | print "proc.stat.cpu %d %s type=iowait" % (ts, fields[4])
135 | print "proc.stat.cpu %d %s type=irq" % (ts, fields[5])
136 | print "proc.stat.cpu %d %s type=softirq" % (ts, fields[6])
137 | # really old kernels don't have this field
138 | if len(fields) > 7:
139 | print ("proc.stat.cpu %d %s type=guest"
140 | % (ts, fields[7]))
141 | # old kernels don't have this field
142 | if len(fields) > 8:
143 | print ("proc.stat.cpu %d %s type=guest_nice"
144 | % (ts, fields[8]))
145 | elif m.group(1) == "intr":
146 | print ("proc.stat.intr %d %s"
147 | % (ts, m.group(2).split()[0]))
148 | elif m.group(1) == "ctxt":
149 | print "proc.stat.ctxt %d %s" % (ts, m.group(2))
150 | elif m.group(1) == "processes":
151 | print "proc.stat.processes %d %s" % (ts, m.group(2))
152 | elif m.group(1) == "procs_blocked":
153 | print "proc.stat.procs_blocked %d %s" % (ts, m.group(2))
154 |
155 | f_loadavg.seek(0)
156 | ts = int(time.time())
157 | for line in f_loadavg:
158 | m = re.match("(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line)
159 | if not m:
160 | continue
161 | print "proc.loadavg.1min %d %s" % (ts, m.group(1))
162 | print "proc.loadavg.5min %d %s" % (ts, m.group(2))
163 | print "proc.loadavg.15min %d %s" % (ts, m.group(3))
164 | print "proc.loadavg.runnable %d %s" % (ts, m.group(4))
165 | print "proc.loadavg.total_threads %d %s" % (ts, m.group(5))
166 |
167 | f_entropy_avail.seek(0)
168 | ts = int(time.time())
169 | for line in f_entropy_avail:
170 | print "proc.kernel.entropy_avail %d %s" % (ts, line.strip())
171 |
172 | print_numa_stats(numastats)
173 |
174 | sys.stdout.flush()
175 | time.sleep(COLLECTION_INTERVAL)
176 |
# tcollector executes this collector as a standalone script.
if __name__ == "__main__":
    main()
179 |
180 |
--------------------------------------------------------------------------------
/collectors/0/redis-stats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2011 by Bump Technologies, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 | #
15 | # Written by Mark Smith .
16 | #
17 |
18 | """Statistics from a Redis instance.
19 |
20 | Note: this collector parses your Redis configuration files to determine what cluster
21 | this instance is part of. If you want the cluster tag to be accurate, please edit
22 | your Redis configuration file and add a comment like this somewhere in the file:
23 |
24 | # tcollector.cluster = main
25 |
26 | You can name the cluster anything that matches the regex [a-z0-9-_]+.
27 |
28 | This collector outputs the following metrics:
29 |
30 | - redis.bgrewriteaof_in_progress
31 | - redis.bgsave_in_progress
32 | - redis.blocked_clients
33 | - redis.changes_since_last_save
34 | - redis.client_biggest_input_buf
35 | - redis.client_longest_output_list
36 | - redis.connected_clients
37 | - redis.connected_slaves
38 | - redis.expired_keys
39 | - redis.evicted_keys
40 | - redis.hash_max_zipmap_entries
41 | - redis.hash_max_zipmap_value
42 | - redis.keyspace_hits
43 | - redis.keyspace_misses
44 | - redis.mem_fragmentation_ratio
45 | - redis.pubsub_channels
46 | - redis.pubsub_patterns
47 | - redis.total_commands_processed
48 | - redis.total_connections_received
49 | - redis.uptime_in_seconds
50 | - redis.used_cpu_sys
51 | - redis.used_cpu_user
52 | - redis.used_memory
53 | - redis.used_memory_rss
54 |
55 | For more information on these values, see this (not very useful) documentation:
56 |
57 | http://redis.io/commands/info
58 | """
59 |
60 | import os
61 | import pwd
62 | import re
63 | import subprocess
64 | import sys
65 | import time
66 |
67 | try:
68 | import redis
69 | has_redis = True
70 | except ImportError:
71 | has_redis = False
72 |
73 | # If we are root, drop privileges to this user, if necessary. NOTE: if this is
74 | # not root, this MUST be the user that you run redis-server under. If not, we
75 | # will not be able to find your Redis instances.
76 | USER = "root"
77 |
78 | # Every SCAN_INTERVAL seconds, we look for new redis instances. Prevents the
79 | # situation where you put up a new instance and we never notice.
80 | SCAN_INTERVAL = 300
81 |
82 | # these are the things in the info struct that we care about
83 | KEYS = [
84 | 'pubsub_channels', 'bgrewriteaof_in_progress', 'connected_slaves', 'connected_clients', 'keyspace_misses',
85 | 'used_memory', 'total_commands_processed', 'used_memory_rss', 'total_connections_received', 'pubsub_patterns',
86 | 'used_cpu_sys', 'blocked_clients', 'used_cpu_user', 'expired_keys', 'bgsave_in_progress', 'hash_max_zipmap_entries',
87 | 'hash_max_zipmap_value', 'client_longest_output_list', 'client_biggest_input_buf', 'uptime_in_seconds',
88 | 'changes_since_last_save', 'mem_fragmentation_ratio', 'keyspace_hits', 'evicted_keys'
89 | ];
90 |
def drop_privileges():
    """If we are running as root, switch to the configured USER account.

    A no-op when USER is 'root', when the account does not exist, or when
    we are not running as root in the first place.
    """
    # Staying root is an explicit configuration choice, and without root
    # there is nothing to drop anyway.
    if USER == 'root' or os.getuid() != 0:
        return
    try:
        passwd_entry = pwd.getpwnam(USER)
    except KeyError:
        # Unknown account on this box: keep the current identity.
        return
    os.setgid(passwd_entry.pw_gid)
    os.setuid(passwd_entry.pw_uid)
106 |
107 |
108 | def main():
109 | """Main loop"""
110 |
111 | drop_privileges()
112 | sys.stdin.close()
113 |
114 | interval = 15
115 |
116 | # we scan for instances here to see if there are any redis servers
117 | # running on this machine...
118 | last_scan = time.time()
119 | instances = scan_for_instances() # port:name
120 | if not len(instances):
121 | return 13
122 | if not has_redis:
123 | sys.stderr.write("Found %d instance(s) to monitor, but the Python"
124 | " Redis module isn't installed.\n" % len(instances))
125 | return 1
126 |
127 | def print_stat(metric, value, tags=""):
128 | if value is not None:
129 | print "redis.%s %d %s %s" % (metric, ts, value, tags)
130 |
131 | while True:
132 | ts = int(time.time())
133 |
134 | # if we haven't looked for redis instances recently, let's do that
135 | if ts - last_scan > SCAN_INTERVAL:
136 | instances = scan_for_instances()
137 | last_scan = ts
138 |
139 | # now iterate over every instance and gather statistics
140 | for port in instances:
141 | tags = "cluster=%s port=%d" % (instances[port], port)
142 |
143 | # connect to the instance and attempt to gather info
144 | r = redis.Redis(host="127.0.0.1", port=port)
145 | info = r.info()
146 | for key in KEYS:
147 | if key in info:
148 | print_stat(key, info[key], tags)
149 |
150 | # get some instant latency information
151 | # TODO: might be nice to get 95th, 99th, etc here?
152 | start_time = time.time()
153 | r.ping()
154 | print_stat("latency", time.time() - start_time, tags)
155 |
156 | sys.stdout.flush()
157 | time.sleep(interval)
158 |
159 |
def scan_for_instances():
    """Use netstat to find instances of Redis listening on the local machine,
    then figure out what configuration file they're using to name the cluster.

    Returns a dict mapping listening port -> cluster name.  The name comes
    from a "# tcollector.cluster = name" comment in the instance's config
    file, defaulting to "port-NNNN" when none is found.
    """

    out = {}
    # Escape the dot so "tcollectorXcluster" doesn't accidentally match.
    tcre = re.compile(r"^\s*#\s*tcollector\.(\w+)\s*=\s*(.+)$")

    ns_proc = subprocess.Popen(["netstat", "-tnlp"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _ = ns_proc.communicate()
    if ns_proc.returncode != 0:
        print >> sys.stderr, "failed to find instances %r" % ns_proc.returncode
        return {}

    for line in stdout.split("\n"):
        if not (line and 'redis-server' in line):
            continue
        pid = int(line.split()[6].split("/")[0])
        # rsplit so IPv6 local addresses such as ":::6379" also parse.
        port = int(line.split()[3].rsplit(":", 1)[1])

        # now we have to get the command line. we look in the redis config file
        # for a special line that tells us what cluster this is. else we default
        # to using the port number which should work.
        cluster = "port-%d" % port
        try:
            f = open("/proc/%d/cmdline" % pid)
            try:
                cfg = f.readline().split("\0")[-2]
            finally:
                f.close()

            f = open(cfg)
            try:
                for cfgline in f:
                    result = tcre.match(cfgline)
                    if result and result.group(1).lower() == "cluster":
                        cluster = result.group(2).lower()
            finally:
                f.close()
        except (EnvironmentError, IndexError):
            # use the default cluster name if anything above failed: the
            # process may have exited, or its cmdline may have no config path.
            pass

        out[port] = cluster
    return out
199 |
200 |
# tcollector runs this as a script; propagate main()'s status code.
if __name__ == "__main__":
    sys.exit(main())
203 |
--------------------------------------------------------------------------------
/collectors/0/riak.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2011 by Bump Technologies, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 | #
15 | # Written by Mark Smith .
16 | #
17 |
18 | """A collector to gather statistics from a Riak node.
19 |
20 | The following all have tags of 'type' which can be 'get' or 'put'. Latency
21 | is measured in fractional seconds. All latency values are calculated over the
22 | last 60 seconds and are moving values.
23 |
24 | - riak.vnode.requests
25 | - riak.node.requests
26 | - riak.node.latency.mean
27 | - riak.node.latency.median
28 | - riak.node.latency.95th
29 | - riak.node.latency.99th
30 | - riak.node.latency.100th
31 |
32 | These metrics have no tags and are global:
33 |
34 | - riak.memory.total
35 | - riak.memory.allocated
36 | - riak.executing_mappers
37 | - riak.sys_process_count
38 | - riak.read_repairs
39 | - riak.connections
40 | - riak.connected_nodes
41 | """
42 |
43 | import json
44 | import os
45 | import pwd
46 | import sys
47 | import time
48 | import urllib2
49 |
50 | # If we're running as root and this user exists, we'll drop privileges. Set this
51 | # to 'root' if you don't want to drop privileges.
52 | USER = "nobody"
53 |
54 | MAP = {
55 | 'vnode_gets_total': ('vnode.requests', 'type=get'),
56 | 'vnode_puts_total': ('vnode.requests', 'type=put'),
57 | 'node_gets_total': ('node.requests', 'type=get'),
58 | 'node_puts_total': ('node.requests', 'type=put'),
59 | 'node_get_fsm_time_mean': ('node.latency.mean', 'type=get'),
60 | 'node_get_fsm_time_median': ('node.latency.median', 'type=get'),
61 | 'node_get_fsm_time_95': ('node.latency.95th', 'type=get'),
62 | 'node_get_fsm_time_99': ('node.latency.99th', 'type=get'),
63 | 'node_get_fsm_time_100': ('node.latency.100th', 'type=get'),
64 | 'node_put_fsm_time_mean': ('node.latency.mean', 'type=put'),
65 | 'node_put_fsm_time_median': ('node.latency.median', 'type=put'),
66 | 'node_put_fsm_time_95': ('node.latency.95th', 'type=put'),
67 | 'node_put_fsm_time_99': ('node.latency.99th', 'type=put'),
68 | 'node_put_fsm_time_100': ('node.latency.100th', 'type=put'),
69 | 'pbc_connects_total': ('connections', ''),
70 | 'read_repairs_total': ('read_repairs', ''),
71 | 'sys_process_count': ('sys_process_count', ''),
72 | 'executing_mappers': ('executing_mappers', ''),
73 | 'mem_allocated': ('memory.allocated', ''),
74 | 'mem_total': ('memory.total', ''),
75 | #connected_nodes is calculated
76 | }
77 |
def drop_privileges():
    """Switch from root to the unprivileged USER account, when applicable."""
    if USER == 'root':
        return  # configured to keep running as root
    try:
        entry = pwd.getpwnam(USER)
    except KeyError:
        return  # no such account on this box; keep current identity
    # Only root can change its uid/gid.
    if os.getuid() == 0:
        os.setgid(entry.pw_gid)
        os.setuid(entry.pw_uid)
93 |
94 |
95 | def main():
96 | """Main loop"""
97 |
98 | # don't run if we're not a riak node
99 | if not os.path.exists("/usr/lib/riak"):
100 | sys.exit(13)
101 |
102 | drop_privileges()
103 | sys.stdin.close()
104 |
105 | interval = 15
106 |
107 | def print_stat(metric, value, tags=""):
108 | if value is not None:
109 | print "riak.%s %d %s %s" % (metric, ts, value, tags)
110 |
111 | while True:
112 | ts = int(time.time())
113 |
114 | req = urllib2.urlopen("http://localhost:8098/stats")
115 | if req is not None:
116 | obj = json.loads(req.read())
117 | for key in obj:
118 | if key not in MAP:
119 | continue
120 | # this is a hack, but Riak reports latencies in microseconds. they're fairly useless
121 | # to our human operators, so we're going to convert them to seconds.
122 | if 'latency' in MAP[key][0]:
123 | obj[key] = obj[key] / 1000000.0
124 | print_stat(MAP[key][0], obj[key], MAP[key][1])
125 | if 'connected_nodes' in obj:
126 | print_stat('connected_nodes', len(obj['connected_nodes']), '')
127 | req.close()
128 |
129 | sys.stdout.flush()
130 | time.sleep(interval)
131 |
132 |
# tcollector runs this as a script; propagate main()'s status code.
if __name__ == "__main__":
    sys.exit(main())
135 |
--------------------------------------------------------------------------------
/collectors/0/zfsiostats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2012 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 | #
15 |
16 | import errno
17 | import sys
18 | import time
19 | import subprocess
20 | import re
21 | import signal
22 | import os
23 |
24 |
25 | '''
26 | ZFS I/O and disk space statistics for TSDB
27 |
28 | This plugin tracks, for all pools:
29 |
30 | - I/O
31 | zfs.io.pool.{read_issued, write_issued}
32 | zfs.io.pool.{read_sectors, write_sectors}
33 | zfs.io.device.{read_issued, write_issued}
34 | zfs.io.device.{read_sectors, write_sectors}
35 | - disk space
36 | zfs.df.pool.1kblocks.{total, used, available}
37 | zfs.df.device.1kblocks.{total, used, available}
38 |
39 | Sectors are always 512 bytes. Disk space usage is given in 1K blocks.
40 | Values delivered to standard output are already normalized to be per second.
41 | '''
42 |
def convert_to_bytes(string):
    """Take a string in the form 1234K, and convert to bytes.

    A bare "-" (zpool's placeholder for "no value") converts to 0 and a
    plain number converts unchanged.  Suffixes K/M/G/T/P are powers of
    1024.  Raises ValueError/IndexError on malformed input, as before.
    """
    factors = {
        "K": 1024,
        "M": 1024 ** 2,
        "G": 1024 ** 3,
        "T": 1024 ** 4,
        "P": 1024 ** 5,
    }
    if string == "-":
        return 0
    # Look up the final character directly instead of scanning all suffixes.
    if string and string[-1] in factors:
        # int() truncates like long() did, auto-promotes on Python 2, and
        # also works on Python 3 (where long no longer exists).
        return int(float(string[:-1]) * factors[string[-1]])
    return int(string)
59 |
def extract_info(line):
    """Parse one `zpool iostat -v` data row.

    Returns (name, s_df, s_io): s_df holds capacity figures in 1K blocks,
    s_io holds operation counts and 512-byte-sector throughput.
    """
    (poolname,
     alloc, free,
     read_issued, write_issued,
     read_sectors, write_sectors) = line.split()

    s_df = {}
    # 1k blocks; `//` keeps the integer (floor) semantics this Python 2
    # code relies on, and behaves identically on Python 3.
    s_df["used"] = convert_to_bytes(alloc) // 1024
    s_df["available"] = convert_to_bytes(free) // 1024
    s_df["total"] = s_df["used"] + s_df["available"]

    s_io = {}
    # magnitudeless variable (operation counts, passed through as-is)
    s_io["read_issued"] = read_issued
    s_io["write_issued"] = write_issued
    # 512 byte sectors
    s_io["read_sectors"] = convert_to_bytes(read_sectors) // 512
    s_io["write_sectors"] = convert_to_bytes(write_sectors) // 512

    return poolname, s_df, s_io
81 |
# States of the line-oriented parser for `zpool iostat -v` output:
T_START = 1      # banner line ("capacity ... operations ... bandwidth")
T_HEADERS = 2    # column headers ("pool alloc free read write ...")
T_SEPARATOR = 3  # dashed separator row
T_POOL = 4       # pool summary row
T_DEVICE = 5     # member device row
T_EMPTY = 6      # blank line terminating one sample block
T_LEG = 7        # mirror "leg" row grouping its devices
89 |
# Last signal received, or None; the main loop runs until this is set.
signal_received = None
def handlesignal(signum, stack):
    """Signal handler: record the signal so the main loop can shut down."""
    global signal_received
    signal_received = signum
94 |
def main():
    """zfsiostats main loop.

    Spawns `zpool iostat -v <interval>` and turns each periodic report
    block into TSDB data points.  Exits with status 13 (tcollector's
    "don't respawn" code) when zpool is not installed here.
    """
    global signal_received
    interval = 15
    # shouldn't the interval be determined by the daemon itself, and commu-
    # nicated to the collector somehow (signals seem like a reasonable protocol
    # whereas command-line parameters also sound reasonable)?

    # Record SIGTERM/SIGINT instead of dying, so we can reap the child below.
    signal.signal(signal.SIGTERM, handlesignal)
    signal.signal(signal.SIGINT, handlesignal)

    try:
        p_zpool = subprocess.Popen(
            ["zpool", "iostat", "-v", str(interval)],
            stdout=subprocess.PIPE,
        )
    except OSError, e:
        if e.errno == errno.ENOENT:
            # it makes no sense to run this collector here
            sys.exit(13) # we signal tcollector to not run us
        raise

    firstloop = True
    # NOTE(review): `lastleg` appears unused -- the loop below reads/writes
    # `last_leg`, first assigned in the T_POOL branch; presumably a typo.
    lastleg = 0
    ltype = None  # parser state (one of the T_* constants) of the last line
    timestamp = int(time.time())
    # Stats accumulated for the current sample block, keyed by pool/device.
    capacity_stats_pool = {}
    capacity_stats_device = {}
    io_stats_pool = {}
    io_stats_device = {}
    # Patterns recognizing the non-data lines zpool emits.
    start_re = re.compile(".*capacity.*operations.*bandwidth")
    headers_re = re.compile(".*pool.*alloc.*free.*read.*write.*read.*write")
    separator_re = re.compile(".*-----.*-----.*-----")
    while signal_received is None:
        try:
            line = p_zpool.stdout.readline()
        except (IOError, OSError), e:
            # readline can be interrupted by the signals registered above
            if e.errno in (errno.EINTR, errno.EAGAIN):
                break
            raise

        if not line:
            # end of the program, die
            break

        # First classify the line, asserting that it is legal in the
        # current state of the line-oriented state machine.
        if start_re.match(line):
            assert ltype in (None, T_EMPTY), \
                "expecting last state T_EMPTY or None, now got %s" % ltype
            ltype = T_START
        elif headers_re.match(line):
            assert ltype == T_START, \
                "expecting last state T_START, now got %s" % ltype
            ltype = T_HEADERS
        elif separator_re.match(line):
            assert ltype in (T_DEVICE, T_HEADERS), \
                "expecting last state T_DEVICE or T_HEADERS, now got %s" % ltype
            ltype = T_SEPARATOR
        elif len(line) < 2:
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_EMPTY
        elif line.startswith(" mirror"):
            assert ltype in (T_POOL, T_DEVICE), \
                "expecting last state T_POOL or T_DEVICE, now got %s" % ltype
            ltype = T_LEG
        elif line.startswith(" "):
            assert ltype in (T_POOL, T_DEVICE, T_LEG), \
                "expecting last state T_POOL or T_DEVICE or T_LEG, now got %s" % ltype
            ltype = T_DEVICE
        else:
            # must be a pool name
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_POOL

        # Then act on the classified line.
        if ltype == T_START:
            # A new report block begins: reset all accumulators and restamp.
            for x in (
                capacity_stats_pool, capacity_stats_device,
                io_stats_pool, io_stats_device,
            ):
                x.clear()
            timestamp = int(time.time())

        elif ltype == T_POOL:
            line = line.strip()
            poolname, s_df, s_io = extract_info(line)
            capacity_stats_pool[poolname] = s_df
            io_stats_pool[poolname] = s_io
            # marker for leg
            last_leg = 0

        elif ltype == T_LEG:
            last_leg = last_leg + 1
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            # Key devices as "pool device<leg>" so names stay unique.
            capacity_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_df
            io_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_io

        elif ltype == T_DEVICE:
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            capacity_stats_device["%s %s" % (poolname, devicename)] = s_df
            io_stats_device["%s %s" % (poolname, devicename)] = s_io

        elif ltype == T_EMPTY:
            # Blank line: the sample block is complete; emit what we gathered.
            if firstloop:
                firstloop = False
            else:
                # this flag prevents printing out of the data in the first loop
                # which is a since-boot summary similar to iostat
                # and is useless to us
                for poolname, stats in capacity_stats_pool.items():
                    fm = "zfs.df.pool.1kblocks.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print fm % (statname, timestamp, statnumber, poolname)
                for poolname, stats in io_stats_pool.items():
                    fm = "zfs.io.pool.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print fm % (statname, timestamp, statnumber, poolname)
                for devicename, stats in capacity_stats_device.items():
                    fm = "zfs.df.device.1kblocks.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print fm % (statname, timestamp, statnumber,
                                    devicename, poolname)
                for devicename, stats in io_stats_device.items():
                    fm = "zfs.io.device.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print fm % (statname, timestamp, statnumber,
                                    devicename, poolname)
            sys.stdout.flush()
            # if this was the first loop, well, we're onto the second loop
            # so we turn the flag off

    # Loop ended (EOF or signal): terminate and reap the zpool child.
    if signal_received is None:
        signal_received = signal.SIGTERM
    try:
        os.kill(p_zpool.pid, signal_received)
    except Exception:
        pass
    p_zpool.wait()
237 |
# tcollector executes this collector as a standalone script.
if __name__ == "__main__":
    main()
240 |
241 |
--------------------------------------------------------------------------------
/collectors/0/zfskernstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2012 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 | #
15 |
16 | import errno
17 | import re
18 | import sys
19 | import time
20 |
21 | '''
22 | ZFS kernel memory statistics for TSDB
23 |
24 | This plugin tracks kernel memory for both:
25 |
26 | - the SPL and its allocated slabs backing ZFS memory
27 | zfs.mem.slab
28 | - the ARC and its various values
29 | zfs.mem.arc
30 | '''
31 |
32 | # /proc/spl/slab has several fields. we only care about the sizes
33 | # and the allocation sizes for the slabs
34 | # /proc/spl/kstat/zfs/arcstats is a table. we only care about the data column
35 |
36 | def main():
37 | """zfsstat main loop"""
38 | interval = 15
39 | typere = re.compile("(^.*)_[0-9]+$")
40 |
41 | try:
42 | f_slab = open("/proc/spl/kmem/slab", "r")
43 | f_arcstats = open("/proc/spl/kstat/zfs/arcstats", "r")
44 | except IOError, e:
45 | if e.errno == errno.ENOENT:
46 | # it makes no sense to run this collector here
47 | sys.exit(13) # we signal tcollector to not run us
48 | raise
49 |
50 | while True:
51 | f_slab.seek(0)
52 | f_arcstats.seek(0)
53 | ts = int(time.time())
54 |
55 | for n, line in enumerate(f_slab):
56 | if n < 2:
57 | continue
58 | line = line.split()
59 | name, _, size, alloc, _, objsize = line[0:6]
60 | size, alloc, objsize = int(size), int(alloc), int(objsize)
61 | typ = typere.match(name)
62 | if typ:
63 | typ = typ.group(1)
64 | else:
65 | typ = name
66 | print ("zfs.mem.slab.size %d %d type=%s objsize=%d" %
67 | (ts, size, typ, objsize)
68 | )
69 | print ("zfs.mem.slab.alloc %d %d type=%s objsize=%d" %
70 | (ts, alloc, typ, objsize)
71 | )
72 |
73 | for n, line in enumerate(f_arcstats):
74 | if n < 2:
75 | continue
76 | line = line.split()
77 | name, _, data = line
78 | data = int(data)
79 | print ("zfs.mem.arc.%s %d %d" %
80 | (name, ts, data)
81 | )
82 |
83 | sys.stdout.flush()
84 | time.sleep(interval)
85 |
# tcollector executes this collector as a standalone script.
if __name__ == "__main__":
    main()
88 |
89 |
--------------------------------------------------------------------------------
/collectors/etc/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # This file is part of tcollector.
3 | # Copyright (C) 2010 StumbleUpon, Inc.
4 | #
5 | # This program is free software: you can redistribute it and/or modify it
6 | # under the terms of the GNU Lesser General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or (at your
8 | # option) any later version. This program is distributed in the hope that it
9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 | # General Public License for more details. You should have received a copy
12 | # of the GNU Lesser General Public License along with this program. If not,
13 | # see <http://www.gnu.org/licenses/>.
14 |
15 | # This 'onload' function will be called by tcollector when it starts up.
16 | # You can put any code here that you want to load inside the tcollector.
17 | # This also gives you a chance to override the options from the command
18 | # line or to add custom sanity checks on their values.
19 | # You can also use this to change the global tags that will be added to
20 | # every single data point. For instance if you have multiple different
21 | # pools or clusters of machines, you might wanna lookup the name of the
22 | # pool or cluster the current host belongs to and add it to the tags.
23 | # Throwing an exception here will cause the tcollector to die before it
24 | # starts doing any work.
25 | # Python files in this directory that don't have an "onload" function
26 | # will be imported by tcollector too, but no function will be called.
27 | # When this file executes, you can assume that its directory is in
28 | # sys.path, so you can import other Python modules from this directory
29 | # or its subdirectories.
30 |
def onload(options, tags):
    """Function called by tcollector when it starts up.

    Args:
        options: The options as returned by the OptionParser.
        tags: A dictionary that maps tag names to tag values.
    """
    # Intentionally a no-op: site-specific setup (extra global tags,
    # option overrides, sanity checks) goes here.
    pass
39 |
--------------------------------------------------------------------------------
/collectors/etc/mysqlconf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
def get_user_password(sockfile):
    """Given the path of a socket file, returns a tuple (user, password)."""
    # Default installation: connect as root with an empty password,
    # whichever socket we are asked about.
    user, password = "root", ""
    return (user, password)
6 |
--------------------------------------------------------------------------------
/collectors/lib/jmx-1.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StumbleUponArchive/tcollector/e09b09153131823b12bfca6824ee90c1d361a011/collectors/lib/jmx-1.0.jar
--------------------------------------------------------------------------------
/startstop:
--------------------------------------------------------------------------------
#!/bin/bash

# Semi Universal start-stop script

# TSD_HOST=dns.name.of.tsd
# Install location of tcollector; overridable from the environment.
TCOLLECTOR_PATH=${TCOLLECTOR_PATH-'/usr/local/tcollector'}
test -n "$TSD_HOST" || {
  echo >&2 "TSD_HOST is not set in $0"
  exit 1
}

HOSTNAME=$(hostname)
PIDFILE=${PIDFILE-'/var/run/tcollector.pid'}
PROG=$TCOLLECTOR_PATH/tcollector.py
LOG=${LOG-'/var/log/tcollector.log'}
# First argument selects the action; everything else is forwarded to
# tcollector verbatim.
COMMAND=$1
shift
ARGS="-c $TCOLLECTOR_PATH/collectors -H $TSD_HOST -t host=$HOSTNAME -P $PIDFILE"
ARGS="$ARGS $@"

# Sanity checks.
test -d "$TCOLLECTOR_PATH" || {
  echo >&2 "No such directory: $TCOLLECTOR_PATH"
  echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
  exit 2
}

test -f "$PROG" || {
  echo >&2 "No such file: $PROG"
  echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
  exit 3
}

for i in "$PIDFILE" "$LOG"; do
  # If the file doesn't exist, check that we have write access to its parent
  # directory to be able to create it.
  test -e "$i" || i=`dirname "$i"`
  test -w "$i" || {
    echo >&2 "$0: error: Cannot write to $i"
    exit 4
  }
done
which_python () {
  # Print the first usable interpreter, preferring newer versions.
  for python in /usr/bin/python2.6 /usr/bin/python2.5 /usr/bin/python; do
    if test -x "$python"; then
      echo "$python"
      return
    fi
  done
  echo >&2 'Could not find a Python interpreter'
  exit 1
}
51 |
52 | PYTHON=$(which_python)
53 |
start () {
  # Launch tcollector in the background, appending all output to the log.
  echo "Starting $PROG"
  # $ARGS is intentionally unquoted: it is a list of separate arguments.
  # "$LOG" is quoted so a log path containing spaces doesn't break the
  # redirection.
  $PYTHON $PROG $ARGS >> "$LOG" 2>&1 &
}
58 |
# stop [signum]
stop () {
  # Kill any tcollector matching our invocation pattern; an optional
  # signal argument (e.g. -9) is passed straight through to pkill.
  echo "Stopping $PROG"
  pkill $1 -f "/usr/bin/python.* $PROG -c"
}
64 |
status () {
  # Succeed (0) iff a running tcollector matches our invocation pattern.
  if ! pgrep -f "/usr/bin/python.* $PROG -c" >/dev/null; then
    return 1
  fi
  echo "$PROG" running
  return 0
}
72 |
# Stop tcollector, escalating to SIGKILL after a few polite attempts,
# then start it again.
forcerestart () {
  stop
  try=1
  sleep 1
  while status; do
    try=$((try + 1))
    # After three failed attempts, stop asking nicely.
    if [[ $try -gt 3 ]]; then
      stop -9
    else
      stop
    fi
    echo "Waiting for $PROG to die.."
    sleep 5
  done
  start
}
89 |
# Dispatch on the requested action.
case $COMMAND in
  start) status || start
  ;;
  force-restart)
    forcerestart
  ;;
  restart)
    # tcollector already respawns collectors if they
    # have changed on-disk, and kills old ones/starts
    # new ones.  The only thing tcollector doesn't do
    # is restart itself if itself has changed.  For a more
    # graceful restart, just make sure we're running and
    # restart only if tcollector is newer on disk than
    # since it started.  This doesn't check for dependencies
    # like asyncproc.py, but that's ok.
    if status; then
      newer=$(find $PROG -newer $PIDFILE | wc -l)
      if [[ $newer -gt 0 ]]; then
        forcerestart
      fi
    else
      start
    fi
  ;;
  stop) stop
  ;;
  status) status
    exit $?
  ;;
  *) echo >&2 "usage: $0 <start|stop|restart|force-restart|status>"
    exit 1
  ;;
esac
123 |
--------------------------------------------------------------------------------
/stumbleupon/monitoring/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 |
--------------------------------------------------------------------------------
/stumbleupon/monitoring/Makefile:
--------------------------------------------------------------------------------
1 | # Copyright 2010 StumbleUpon, Inc.
2 | #
3 | # This library is free software: you can redistribute it and/or modify it
4 | # under the terms of the GNU Lesser General Public License as published
5 | # by the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This library is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU Lesser General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU Lesser General Public License
14 | # along with this library. If not, see .
15 |
all: jmx

# Build configuration.
top_builddir = build
package = com.stumbleupon.monitoring
spec_title = Monitoring Tools
spec_vendor = StumbleUpon, Inc.
spec_version = 1.0
# Java sources making up the jmx tool.
jmx_JAVA = \
	jmx.java \

# Compile-time dependencies (Sun JDK tools.jar, for the attach API).
jmx_LIBADD = \
	/usr/lib/jvm/java-6-sun/lib/tools.jar \

AM_JAVACFLAGS = -Xlint -source 6
JVM_ARGS =
# com.stumbleupon.monitoring -> com/stumbleupon/monitoring
package_dir = $(subst .,/,$(package))
jmx_classes=$(jmx_JAVA:%.java=$(top_builddir)/$(package_dir)/%.class)
jmx_jar = $(top_builddir)/jmx-$(spec_version).jar

jmx: $(jmx_jar)

# Colon-separated classpath built from the dependency list.
jmx_get_dep_classpath = `echo $(jmx_LIBADD) | tr ' ' ':'`
# Compile the sources; the stamp file records the last successful build.
$(top_builddir)/.javac-stamp: $(jmx_JAVA)
	@mkdir -p $(top_builddir)
	javac $(AM_JAVACFLAGS) -cp $(jmx_get_dep_classpath) \
	 -d $(top_builddir) $(jmx_JAVA)
	@touch "$@"

classes_with_nested_classes = $(jmx_classes:$(top_builddir)/%.class=%*.class)

# Short git revision of HEAD, or "unknown" outside a checkout.
pkg_version = \
	`git rev-list --pretty=format:%h HEAD --max-count=1 | sed 1d || echo unknown`
# Jar manifest recording what was built and from which revision.
$(top_builddir)/manifest: $(top_builddir)/.javac-stamp ../../.git/HEAD
	{ echo "Specification-Title: $(spec_title)"; \
	  echo "Specification-Version: $(spec_version)"; \
	  echo "Specification-Vendor: $(spec_vendor)"; \
	  echo "Implementation-Title: $(package)"; \
	  echo "Implementation-Version: $(pkg_version)"; \
	  echo "Implementation-Vendor: $(spec_vendor)"; } >"$@"
# Assemble the jar; on failure, delete the partial jar before exiting.
# Fix: the cleanup used the undefined variable $(jar), so the partially
# built jar was never actually removed -- it must be $(jmx_jar).
$(jmx_jar): $(top_builddir)/manifest $(top_builddir)/.javac-stamp $(jmx_classes)
	cd $(top_builddir) && jar cfm `basename $(jmx_jar)` manifest $(classes_with_nested_classes) \
	|| { rv=$$? && rm -f `basename $(jmx_jar)` && exit $$rv; }
#        ^^^^^^^^^^^^^^^^^^^^^^^
# I've seen cases where `jar' exits with an error but leaves a partially built .jar file!
# Generate the javadoc API documentation under $(top_builddir)/api.
doc: $(top_builddir)/api/index.html

JDK_JAVADOC=http://download.oracle.com/javase/6/docs/api
# NOTE: the classpath variable is jmx_get_dep_classpath (the bare
# get_dep_classpath used previously was never defined and expanded to
# nothing), and -link takes documentation URLs, not source files, so the
# sources are passed as plain arguments.
$(top_builddir)/api/index.html: $(jmx_JAVA) $(BUILT_SOURCES)
	javadoc -d $(top_builddir)/api -classpath $(jmx_get_dep_classpath) \
	 -link $(JDK_JAVADOC) $(jmx_JAVA) $(BUILT_SOURCES)
68 |
# Remove intermediate build products (class files, stamp, manifest) but
# keep the jar and generated docs (see distclean for those).
# Each `cd $(top_builddir) || exit 0' makes the recipe a no-op when the
# build directory does not exist.  The final pipeline removes the
# now-empty package directories bottom-up, stopping at the first parent
# that is not empty.
clean:
	@rm -f $(top_builddir)/.javac-stamp
	rm -f $(top_builddir)/manifest $(BUILT_SOURCES)
	cd $(top_builddir) || exit 0 && rm -f $(classes_with_nested_classes)
	cd $(top_builddir) || exit 0 \
	  && test -d $(package_dir) || exit 0 \
	  && find $(package_dir) -type d -depth -exec rmdir {} ';' \
	  && dir=$(package_dir) && dir=$${dir%/*} \
	  && while test x"$$dir" != x"$${dir%/*}"; do \
	       rmdir "$$dir" && dir=$${dir%/*} || break; \
	     done \
	  && rmdir "$$dir"
81 |
# Remove everything `clean' leaves behind: the jar, the docs and the
# build directory itself (if empty).
# NOTE: this previously removed $(jar) -- an undefined variable -- so the
# built jar was never actually deleted by distclean.
distclean: clean
	rm -f $(jmx_jar)
	rm -rf $(top_builddir)/api
	test ! -d $(top_builddir) || rmdir $(top_builddir)

.PHONY: all jmx clean distclean doc check
88 |
--------------------------------------------------------------------------------
/stumbleupon/monitoring/jmx.java:
--------------------------------------------------------------------------------
1 | // This file is part of OpenTSDB.
2 | // Copyright (C) 2010 StumbleUpon, Inc.
3 | //
4 | // This program is free software: you can redistribute it and/or modify it
5 | // under the terms of the GNU Lesser General Public License as published by
6 | // the Free Software Foundation, either version 3 of the License, or (at your
7 | // option) any later version. This program is distributed in the hope that it
8 | // will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
9 | // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
10 | // General Public License for more details. You should have received a copy
11 | // of the GNU Lesser General Public License along with this program. If not,
12 | // see <http://www.gnu.org/licenses/>.
13 |
14 | /** Quick CLI tool to get JMX MBean attributes. */
15 | package com.stumbleupon.monitoring;
16 |
17 | import java.io.File;
18 | import java.io.IOException;
19 | import java.util.ArrayList;
20 | import java.util.Collection;
21 | import java.util.Collections;
22 | import java.util.Comparator;
23 | import java.util.HashMap;
24 | import java.util.Iterator;
25 | import java.util.Map;
26 | import java.util.Set;
27 | import java.util.TreeMap;
28 | import java.util.regex.Pattern;
29 | import java.util.regex.PatternSyntaxException;
30 |
31 | import javax.management.MBeanAttributeInfo;
32 | import javax.management.MBeanInfo;
33 | import javax.management.MBeanServerConnection;
34 | import javax.management.ObjectName;
35 | import javax.management.openmbean.TabularData;
36 | import javax.management.remote.JMXConnector;
37 | import javax.management.remote.JMXConnectorFactory;
38 | import javax.management.remote.JMXServiceURL;
39 |
40 | // Sun specific
41 | import com.sun.tools.attach.AgentInitializationException;
42 | import com.sun.tools.attach.AgentLoadException;
43 | import com.sun.tools.attach.AttachNotSupportedException;
44 | import com.sun.tools.attach.VirtualMachine;
45 | import com.sun.tools.attach.VirtualMachineDescriptor;
46 |
47 | // Sun private
48 | import sun.management.ConnectorAddressLink;
49 | import sun.jvmstat.monitor.HostIdentifier;
50 | import sun.jvmstat.monitor.MonitoredHost;
51 | import sun.jvmstat.monitor.MonitoredVm;
52 | import sun.jvmstat.monitor.MonitoredVmUtil;
53 | import sun.jvmstat.monitor.VmIdentifier;
54 |
55 | final class jmx {
56 |
57 | private static final String LOCAL_CONNECTOR_ADDRESS =
58 | "com.sun.management.jmxremote.localConnectorAddress";
59 |
60 | private static void usage() {
61 | System.out.println("Usage:\n"
62 | + " jmx -l Lists all reachable VMs.\n"
63 | + " jmx Lists all MBeans for this JVM (PID or regexp).\n"
64 | + " jmx Prints all the attributes of this MBean.\n"
65 | + " jmx Prints the matching attributes of this MBean.\n"
66 | + "\n"
67 | + "You can pass multiple pairs to match multiple different\n"
68 | + "attributes for different MBeans. For example:\n"
69 | + " jmx --long JConsole Class Count Thread Total Garbage Collection\n"
70 | + " LoadedClassCount 2808 java.lang:type=ClassLoading\n"
71 | + " UnloadedClassCount 0 java.lang:type=ClassLoading\n"
72 | + " TotalLoadedClassCount 2808 java.lang:type=ClassLoading\n"
73 | + " CollectionCount 0 java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
74 | + " CollectionTime 0 java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
75 | + " CollectionCount 1 java.lang:type=GarbageCollector,name=ParNew\n"
76 | + " CollectionTime 19 java.lang:type=GarbageCollector,name=ParNew\n"
77 | + " TotalStartedThreadCount 43 java.lang:type=Threading\n"
78 | + "The command above searched for a JVM with `JConsole' in its name, and then searched\n"
79 | + "for MBeans with `Class' in the name and `Count' in the attribute (first 3 matches\n"
80 | + "in this output), MBeans with `Thread' in the name and `Total' in the attribute (last\n"
81 | + "line in the output) and MBeans matching `Garbage' with a `Collection' attribute.\n"
82 | + "\n"
83 | + "Other flags you can pass:\n"
84 | + " --long Print a longer but more explicit output for each value.\n"
85 | + " --timestamp Print a timestamp at the beginning of each line.\n"
86 | + " --watch N Reprint the output every N seconds.\n"
87 | + "\n"
88 | + "Return value:\n"
89 | + " 0: Everything OK.\n"
90 | + " 1: Invalid usage or unexpected error.\n"
91 | + " 2: No JVM matched.\n"
92 | + " 3: No MBean matched.\n"
93 | + " 4: No attribute matched for the MBean(s) selected.");
94 | }
95 |
96 | private static void fatal(final int rv, final String errmsg) {
97 | System.err.println(errmsg);
98 | System.exit(rv);
99 | throw new AssertionError("You should never see this, really.");
100 | }
101 |
102 | public static void main(final String[] args) throws Exception {
103 | if (args.length == 0 || "-h".equals(args[0]) || "--help".equals(args[0])) {
104 | usage();
105 | System.exit(args.length == 0 ? 1 : 0);
106 | return;
107 | }
108 |
109 | int current_arg = 0;
110 | int watch = 0;
111 | boolean long_output = false;
112 | boolean print_timestamps = false;
113 | while (current_arg < args.length) {
114 | if ("--watch".equals(args[current_arg])) {
115 | current_arg++;
116 | try {
117 | watch = Integer.parseInt(args[current_arg]);
118 | } catch (NumberFormatException e) {
119 | fatal(1, "Invalid value for --watch: " + e.getMessage());
120 | return;
121 | }
122 | if (watch < 1) {
123 | fatal(1, "Invalid value for --watch: " + watch);
124 | }
125 | current_arg++;
126 | } else if ("--long".equals(args[current_arg])) {
127 | long_output = true;
128 | current_arg++;
129 | } else if ("--timestamp".equals(args[current_arg])) {
130 | print_timestamps = true;
131 | current_arg++;
132 | } else {
133 | break;
134 | }
135 | }
136 |
137 | if (current_arg == args.length) {
138 | usage();
139 | fatal(1, "error: Missing argument (-l or JVM specification).");
140 | return;
141 | }
142 |
143 | HashMap vms = getJVMs();
144 | if ("-l".equals(args[current_arg])) {
145 | printVmList(vms.values());
146 | return;
147 | }
148 |
149 | final JVM jvm = selectJVM(args[current_arg++], vms);
150 | vms = null;
151 | final JMXConnector connection = JMXConnectorFactory.connect(jvm.jmxUrl());
152 | try {
153 | final MBeanServerConnection mbsc = connection.getMBeanServerConnection();
154 | if (args.length == current_arg) {
155 | for (final ObjectName mbean : listMBeans(mbsc)) {
156 | System.out.println(mbean);
157 | }
158 | return;
159 | }
160 |
161 | final TreeMap objects = selectMBeans(args, current_arg, mbsc);
162 | if (objects.isEmpty()) {
163 | fatal(3, "No MBean matched your query in " + jvm.name());
164 | return;
165 | }
166 | do {
167 | boolean found = false;
168 | for (final Map.Entry entry : objects.entrySet()) {
169 | final ObjectName object = entry.getKey();
170 | final MBeanInfo mbean = mbsc.getMBeanInfo(object);
171 | final Pattern wanted = entry.getValue();
172 | for (final MBeanAttributeInfo attr : mbean.getAttributes()) {
173 | if (wanted == null || wanted.matcher(attr.getName()).find()) {
174 | dumpMBean(long_output, print_timestamps, mbsc, object, attr);
175 | found = true;
176 | }
177 | }
178 | }
179 | if (!found) {
180 | fatal(4, "No attribute of " + objects.keySet()
181 | + " matched your query in " + jvm.name());
182 | return;
183 | }
184 | System.out.flush();
185 | Thread.sleep(watch * 1000);
186 | } while (watch > 0);
187 | } finally {
188 | connection.close();
189 | }
190 | }
191 |
192 | private static TreeMap selectMBeans(final String[] args,
193 | final int current_arg,
194 | final MBeanServerConnection mbsc) throws IOException {
195 | final TreeMap mbeans = new TreeMap();
196 | for (int i = current_arg; i < args.length; i += 2) {
197 | final Pattern object_re = compile_re(args[i]);
198 | final Pattern attr_re = i + 1 < args.length ? compile_re(args[i + 1]) : null;
199 | for (final ObjectName o : listMBeans(mbsc)) {
200 | if (object_re.matcher(o.toString()).find()) {
201 | mbeans.put(o, attr_re);
202 | }
203 | }
204 | }
205 | return mbeans;
206 | }
207 |
208 | private static void dumpMBean(final boolean long_output,
209 | final boolean print_timestamps,
210 | final MBeanServerConnection mbsc,
211 | final ObjectName object,
212 | final MBeanAttributeInfo attr) throws Exception {
213 | final String name = attr.getName();
214 | Object value = mbsc.getAttribute(object, name);
215 | if (value instanceof TabularData) {
216 | final TabularData tab = (TabularData) value;
217 | int i = 0;
218 | for (final Object o : tab.keySet()) {
219 | dumpMBeanValue(long_output, print_timestamps, object, name + "." + i, o);
220 | i++;
221 | }
222 | } else {
223 | dumpMBeanValue(long_output, print_timestamps, object, name, value);
224 | }
225 | }
226 |
227 | private static void dumpMBeanValue(final boolean long_output,
228 | final boolean print_timestamps,
229 | final ObjectName object,
230 | final String name,
231 | final Object value) {
232 | final StringBuilder buf = new StringBuilder();
233 | final long timestamp = System.currentTimeMillis() / 1000;
234 | if (print_timestamps) {
235 | buf.append(timestamp).append('\t');
236 | }
237 | if (value instanceof Object[]) {
238 | for (final Object o : (Object[]) value) {
239 | buf.append(o).append('\t');
240 | }
241 | buf.setLength(buf.length() - 1);
242 | } else {
243 | buf.append(name).append('\t').append(value);
244 | }
245 | if (long_output) {
246 | buf.append('\t').append(object);
247 | }
248 | buf.append('\n');
249 | System.out.print(buf);
250 | }
251 |
252 | private static ArrayList listMBeans(final MBeanServerConnection mbsc) throws IOException {
253 | ArrayList mbeans = new ArrayList(mbsc.queryNames(null, null));
254 | Collections.sort(mbeans, new Comparator() {
255 | public int compare(final ObjectName a, final ObjectName b) {
256 | return a.toString().compareTo(b.toString());
257 | }
258 | });
259 | return mbeans;
260 | }
261 |
262 | private static Pattern compile_re(final String re) {
263 | try {
264 | return Pattern.compile(re);
265 | } catch (PatternSyntaxException e) {
266 | fatal(1, "Invalid regexp: " + re + ", " + e.getMessage());
267 | throw new AssertionError("Should never be here");
268 | }
269 | }
270 |
271 | private static final String MAGIC_STRING = "this.is.jmx.magic";
272 |
273 | private static JVM selectJVM(final String selector,
274 | final HashMap vms) {
275 | String error = null;
276 | try {
277 | final int pid = Integer.parseInt(selector);
278 | if (pid < 2) {
279 | throw new IllegalArgumentException("Invalid PID: " + pid);
280 | }
281 | final JVM jvm = vms.get(pid);
282 | if (jvm != null) {
283 | return jvm;
284 | }
285 | error = "Couldn't find a JVM with PID " + pid;
286 | } catch (NumberFormatException e) {
287 | /* Ignore. */
288 | }
289 | if (error == null) {
290 | try {
291 | final Pattern p = compile_re(selector);
292 | final ArrayList matches = new ArrayList(2);
293 | for (final JVM jvm : vms.values()) {
294 | if (p.matcher(jvm.name()).find()) {
295 | matches.add(jvm);
296 | }
297 | }
298 | // Exclude ourselves from the matches.
299 | System.setProperty(MAGIC_STRING,
300 | "LOL Java processes can't get their own PID");
301 | final String me = jmx.class.getName();
302 | final Iterator it = matches.iterator();
303 | while (it.hasNext()) {
304 | final JVM jvm = it.next();
305 | final String name = jvm.name();
306 | // Ignore other long running jmx clients too.
307 | if (name.contains("--watch") && name.contains(me)) {
308 | it.remove();
309 | continue;
310 | }
311 | final VirtualMachine vm = VirtualMachine.attach(String.valueOf(jvm.pid()));
312 | try {
313 | if (vm.getSystemProperties().containsKey(MAGIC_STRING)) {
314 | it.remove();
315 | continue;
316 | }
317 | } finally {
318 | vm.detach();
319 | }
320 | }
321 | System.clearProperty(MAGIC_STRING);
322 | if (matches.size() == 0) {
323 | error = "No JVM matched your regexp " + selector;
324 | } else if (matches.size() > 1) {
325 | printVmList(matches);
326 | error = matches.size() + " JVMs matched your regexp " + selector
327 | + ", it's too ambiguous, please refine it.";
328 | } else {
329 | return matches.get(0);
330 | }
331 | } catch (PatternSyntaxException e) {
332 | error = "Invalid pattern: " + selector + ", " + e.getMessage();
333 | } catch (Exception e) {
334 | e.printStackTrace();
335 | error = "Unexpected Exception: " + e.getMessage();
336 | }
337 | }
338 | fatal(2, error);
339 | return null;
340 | }
341 |
342 | private static void printVmList(final Collection vms) {
343 | final ArrayList sorted_vms = new ArrayList(vms);
344 | Collections.sort(sorted_vms, new Comparator() {
345 | public int compare(final JVM a, final JVM b) {
346 | return a.pid() - b.pid();
347 | }
348 | });
349 | for (final JVM jvm : sorted_vms) {
350 | System.out.println(jvm.pid() + "\t" + jvm.name());
351 | }
352 | }
353 |
354 | private static final class JVM {
355 | final int pid;
356 | final String name;
357 | String address;
358 |
359 | public JVM(final int pid, final String name, final String address) {
360 | if (name.isEmpty()) {
361 | throw new IllegalArgumentException("empty name");
362 | }
363 | this.pid = pid;
364 | this.name = name;
365 | this.address = address;
366 | }
367 |
368 | public int pid() {
369 | return pid;
370 | }
371 |
372 | public String name() {
373 | return name;
374 | }
375 |
376 | public JMXServiceURL jmxUrl() {
377 | if (address == null) {
378 | ensureManagementAgentStarted();
379 | }
380 | try {
381 | return new JMXServiceURL(address);
382 | } catch (Exception e) {
383 | throw new RuntimeException("Error", e);
384 | }
385 | }
386 |
387 | public void ensureManagementAgentStarted() {
388 | if (address != null) { // already started
389 | return;
390 | }
391 | VirtualMachine vm;
392 | try {
393 | vm = VirtualMachine.attach(String.valueOf(pid));
394 | } catch (AttachNotSupportedException e) {
395 | throw new RuntimeException("Failed to attach to " + this, e);
396 | } catch (IOException e) {
397 | throw new RuntimeException("Failed to attach to " + this, e);
398 | }
399 | try {
400 | // java.sun.com/javase/6/docs/technotes/guides/management/agent.html#gdhkz
401 | // + code mostly stolen from JConsole's code.
402 | final String home = vm.getSystemProperties().getProperty("java.home");
403 |
404 | // Normally in ${java.home}/jre/lib/management-agent.jar but might
405 | // be in ${java.home}/lib in build environments.
406 |
407 | String agent = home + File.separator + "jre" + File.separator
408 | + "lib" + File.separator + "management-agent.jar";
409 | File f = new File(agent);
410 | if (!f.exists()) {
411 | agent = home + File.separator + "lib" + File.separator
412 | + "management-agent.jar";
413 | f = new File(agent);
414 | if (!f.exists()) {
415 | throw new RuntimeException("Management agent not found");
416 | }
417 | }
418 |
419 | agent = f.getCanonicalPath();
420 | try {
421 | vm.loadAgent(agent, "com.sun.management.jmxremote");
422 | } catch (AgentLoadException e) {
423 | throw new RuntimeException("Failed to load the agent into " + this, e);
424 | } catch (AgentInitializationException e) {
425 | throw new RuntimeException("Failed to initialize the agent into " + this, e);
426 | }
427 | address = (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS);
428 | } catch (IOException e) {
429 | throw new RuntimeException("Error while loading agent into " + this, e);
430 | } finally {
431 | try {
432 | vm.detach();
433 | } catch (IOException e) {
434 | throw new RuntimeException("Failed to detach from " + vm + " = " + this, e);
435 | }
436 | }
437 | if (address == null) {
438 | throw new RuntimeException("Couldn't start the management agent.");
439 | }
440 | }
441 |
442 | public String toString() {
443 | return "JVM(" + pid + ", \"" + name + "\", "
444 | + (address == null ? null : '"' + address + '"') + ')';
445 | }
446 | }
447 |
448 | /**
449 | * Returns a map from PID to JVM.
450 | */
451 | private static HashMap getJVMs() throws Exception {
452 | final HashMap vms = new HashMap();
453 | getMonitoredVMs(vms);
454 | getAttachableVMs(vms);
455 | return vms;
456 | }
457 |
458 | private static void getMonitoredVMs(final HashMap out) throws Exception {
459 | final MonitoredHost host =
460 | MonitoredHost.getMonitoredHost(new HostIdentifier((String) null));
461 | @SuppressWarnings("unchecked")
462 | final Set vms = host.activeVms();
463 | for (final Integer pid : vms) {
464 | try {
465 | final VmIdentifier vmid = new VmIdentifier(pid.toString());
466 | final MonitoredVm vm = host.getMonitoredVm(vmid);
467 | out.put(pid, new JVM(pid, MonitoredVmUtil.commandLine(vm),
468 | ConnectorAddressLink.importFrom(pid)));
469 | vm.detach();
470 | } catch (Exception x) {
471 | System.err.println("Ignoring exception:");
472 | x.printStackTrace();
473 | }
474 | }
475 | }
476 |
477 | private static void getAttachableVMs(final HashMap out) {
478 | for (final VirtualMachineDescriptor vmd : VirtualMachine.list()) {
479 | int pid;
480 | try {
481 | pid = Integer.parseInt(vmd.id());
482 | } catch (NumberFormatException e) {
483 | System.err.println("Ignoring invalid vmd.id(): " + vmd.id()
484 | + ' ' + e.getMessage());
485 | continue;
486 | }
487 | if (out.containsKey(pid)) {
488 | continue;
489 | }
490 | try {
491 | final VirtualMachine vm = VirtualMachine.attach(vmd);
492 | out.put(pid, new JVM(pid, String.valueOf(pid),
493 | (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS)));
494 | vm.detach();
495 | } catch (AttachNotSupportedException e) {
496 | System.err.println("VM not attachable: " + vmd.id()
497 | + ' ' + e.getMessage());
498 | } catch (IOException e) {
499 | System.err.println("Could not attach: " + vmd.id()
500 | + ' ' + e.getMessage());
501 | }
502 | }
503 | }
504 |
505 | }
506 |
--------------------------------------------------------------------------------
/stumbleupon/tcollector.pp:
--------------------------------------------------------------------------------
1 | # Example Puppet manifest for updating/starting tcollector
2 | # under puppet
3 |
# Deploys tcollector from the puppet fileserver and keeps the daemon
# running via the bundled startstop script.
class tcollector {
    # tcollector and its collectors are Python scripts.
    package { 'python':
        ensure => installed,
    }

    # Parent directory of the deployed tree.
    file { ['/usr/local']:
        ensure => directory,
        owner  => root, group => root, mode => 755,
    }

    # Mirror the whole tcollector tree from the fileserver; anything not
    # on the fileserver (including stale *.pyc byte-code) is purged.
    file { '/usr/local/tcollector':
        ensure  => directory,
        source  => 'puppet:///files/tcollector',
        owner   => root, group => root,
        recurse => true,
        ignore  => '*.pyc',
        purge   => true,
        force   => true,
        require => File['/usr/local'],
    }

    # Drive the daemon through the startstop wrapper and restart it
    # whenever the deployed tree changes.
    service { 'tcollector':
        ensure    => running,
        start     => '/usr/local/tcollector/startstop start',
        stop      => '/usr/local/tcollector/startstop stop',
        restart   => '/usr/local/tcollector/startstop restart',
        status    => '/usr/local/tcollector/startstop status',
        require   => [Package['python'], File['/usr/local/tcollector']],
        subscribe => File['/usr/local/tcollector'],
    }
}
35 |
--------------------------------------------------------------------------------