├── LICENSE
├── README.md
├── ceph_log_parser.awk
├── ceph_log_parser.luminous.awk
├── compacting_timing.awk
├── deep-scrub_timing.awk
├── images
│   ├── Histogram.png
│   ├── OSDs.png
│   └── Pools.png
├── iops_histo.sh
├── map_reporters_to_buckets.sh
├── rgw_proc_time.awk
├── rgw_req_timing.sh
└── slow_by_osd-pool-type.awk
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ceph-log-parsers
2 | Tools for parsing ceph logs to help with troubleshooting various issues.
3 |
4 | ## Tool Explanations:
5 | NOTE: I've shortened the sample outputs below with ellipses for the sake of brevity.
6 |
7 | #### ceph_log_parser.awk
8 | - Run with ceph.log and redirect output to a CSV
9 |
10 | ```
11 | ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv
12 | ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv
13 | ```
14 |
15 | Available options:
16 |
17 | -v osdtree=ceph_osd_tree.txt
18 | If provided, the osd output portion will be output with its branch path in the crushmap
19 |
20 | -v timeinterval=(1|10|60|day)
21 | If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes)
22 |
23 | -v bucketsummary=1
24 | If provided, provides an output below the OSD data summarizing the OSD counts for each
25 | successive bucket branch above the OSD ( example: host, rack, row, root )
26 | Default is 1 if 'osdtree' is defined.
27 |
28 | -v osdhisto=1
29 | Provides a column per OSD in the time histogram showing initial 'slow request' entries
30 | incurred by that OSD during the time interval.
31 | Default is disabled because this can make VERY wide spreadsheets
32 |
33 | NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be
34 | ignored
35 |
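The `osdtree` input is simply the saved plain-text output of `ceph osd tree` (the same file used by map_reporters_to_buckets.sh below), for example:

```
# capture the crush tree once, then feed it to the parser along with ceph.log
ceph osd tree > ceph_osd_tree.txt
./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=10 ceph.log > ceph-log-parsed.csv
```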
36 |
37 | * For items which are averages, the values are summed and averaged over the measurement interval.
38 |   The result is reported at the beginning of the interval.
39 |
40 |   e.g. IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line
41 |
42 | * For items which are a static snapshot, these are reported based on the last line containing those
43 | details in the log before the end of the measurement interval
44 |
45 | e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for
46 | the 03:30 line
47 |
48 | * For items like the Slow requests, the count of those entries is summed during the measured period and reported
49 |
50 | e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported.
51 | If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma
52 | separated numbers) are all counted in the 'Slow SubOp' line. For 3x replication and 50 lines, the reported
53 | number would be 100 (due to 2x non-primary copies * 50 lines)
54 |
55 | * NOTE: Slow request processing has changed as of 27 Feb 2018. The initial slow request (delay < 60 seconds) and
56 |   relogged slow requests (delay > 60 seconds) are logged separately to better understand if an issue is ongoing.
57 |
58 |
59 | ##### ATTENTION:
60 | - This command's output, more than any of the others, really should be viewed in a spreadsheet tool. I typically highlight the headers (at the top of each section), bold them, rotate them so the text is vertical, then auto-adjust the column widths to get a more concise view which is much easier to visually parse. Graphing the data in this report can also make trends stand out and help narrow the scope when hunting for faulting components.
61 |
62 | ###### Example:
63 | ```
64 | # ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=10 -v bucketsummary=1 ceph.log > ~/ceph-log-parsed.csv
65 | # cat ~/ceph-log-parsed.csv
66 |
67 | DateTime,Deep-Scrub: OK,Deep-Scrub: Starts,IO: Client Avg IOPs,IO: Client Avg Read MB/s,IO: Client Avg Write MB/s,IO: Recovery Avg MB/s,OSD Boot,OSD Down: No PG stats,OSD Down: Reported Failed,OSD Out,OSD Wrongly,OSDs,OSDs IN,OSDs UP,Objects: Degraded Percent,Objects: Misplaced Percent,PG: activating,PG: active,PG: backfill_toofull,PG: backfilling,PG: clean,PG: deep,PG: degraded,PG: down,PG: inactive,PG: incomplete,PG: peered,PG: peering,PG: recovering,PG: recovery_wait,PG: remapped,PG: scrubbing,PG: stale,PG: undersized,PG: wait_backfill,Slow Primary,Slow Primary: commit_sent,Slow Primary: no flag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waiting for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total,Space (TB): Data Stored,Space (TB): Free,Space (TB): Raw Used,Space (TB): Total
72 | 2018-01-21 03:10:00,,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
73 | 2018-01-21 03:20:00,19,12,10193.47,132.71,86.42,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
74 | 2018-01-21 03:30:00,6,7,11243.27,214.92,70.60,,,,,,,,,,,,,32960,,,32960,1,,,,,,,,,,1,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
75 | 2018-01-21 03:40:00,9,8,9566.01,202.62,73.42,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
76 | 2018-01-21 03:50:00,1,1,8549.33,163.93,71.18,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
77 | 2018-01-21 04:00:00,,,8331.46,121.57,65.20,,,,,,,,,,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
78 | 2018-01-21 04:10:00,11,13,7480.16,58.25,80.61,,,,,,,249,236,236,,,,32960,,,32960,2,,,,,,,,,,2,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
79 | 2018-01-21 04:20:00,13,11,7202.10,41.08,66.31,,,,,,,,,,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00
80 | ...
81 | 2018-01-22 03:20:00,13,21,7216.23,88046.62,99450.79,0.00,141,,199,39,144,249,197,178,19.106,10.282,500,26598,,88,15737,4,13503,26,2,,2221,3639,143,2206,5011,4,780,11410,4486,80786,106,4351,74801,7,43,3,,1475,,5465,86251,132.00,198.00,337.00,535.00
82 | 2018-01-22 03:30:00,12,18,9517.31,156494.72,200462.23,0.00,,3,8,22,,249,175,167,26.780,21.825,5,29840,,90,17326,11,15559,66,,,3042,73,63,3678,9054,11,816,12360,8934,45929,292,8915,36457,18,2,5,,240,,6408,52337,133.00,174.00,300.00,475.00
83 | 2018-01-22 03:40:00,25,18,5481.73,52358.01,71523.06,0.00,1,,1,8,1,249,167,167,26.497,25.867,7,29920,,55,19215,3,13677,66,,,2967,66,58,3391,10242,3,816,10242,10167,18953,146,4204,14561,15,1,3,,23,,2464,21417,133.00,165.00,288.00,453.00
84 | 2018-01-22 03:50:00,10,11,7.30,0.03,0.08,0.00,,,,,,249,167,167,26.201,25.723,1,29942,,66,20370,4,12523,66,,,2951,66,43,2299,10192,4,816,10192,10115,15095,94,3218,11773,10,,,,,,1422,16517,133.00,163.00,290.00,453.00
85 | 2018-01-22 04:00:00,27,33,6.93,0.00,0.07,0.00,,,,,,249,167,167,25.892,25.512,,29981,,76,21228,10,11666,66,,,2913,66,23,1514,10136,10,816,10136,10053,10415,138,2452,7809,4,,,,12,,914,11329,133.00,162.00,291.00,453.00
86 | 2018-01-22 04:10:00,37,38,11.08,5704.84,7398.04,0.00,1,,1,,1,249,167,167,25.571,25.226,,30042,,93,21682,10,11211,66,,,2852,66,12,1138,10068,10,816,10068,9968,4657,63,885,3697,12,,,,,,518,5175,133.00,160.00,293.00,453.00
87 | 2018-01-22 04:20:00,28,22,5.14,0.10,0.15,0.00,,,,,,249,167,167,25.219,24.890,1,30116,,103,22079,5,10814,66,,,2777,66,14,818,9986,5,816,9986,9879,7952,38,1040,6870,4,,,,,,584,8536,133.00,159.00,294.00,453.00
88 | 2018-01-22 04:30:00,13,12,50.82,15611.23,13142.18,0.00,1,,1,,1,249,167,167,24.858,24.539,,30198,,104,22402,4,10492,66,,,2696,66,5,590,9900,4,816,9900,9793,10170,52,2320,7795,3,,,,,,818,10988,133.00,157.00,296.00,453.00
89 | 2018-01-22 04:40:00,18,23,12.58,0.02,0.03,0.00,,,1,,,249,167,166,24.873,24.018,,30209,,111,22214,7,10665,77,,4,2670,77,6,455,9754,7,816,10208,9640,6481,63,1135,5278,5,,,,,,554,7035,133.00,156.00,297.00,453.00
90 | 2018-01-22 04:50:00,22,18,146.06,1123.10,1301.40,0.00,,,,1,,249,166,166,24.490,24.207,,30304,,96,22559,5,10320,72,,9,2575,72,3,490,9829,5,816,9828,9731,3935,36,626,3271,2,,,,,,587,4522,133.00,153.00,297.00,451.00
91 | 2018-01-22 05:00:00,16,16,740.75,10099.43,9356.42,0.00,2,,2,,3,249,166,166,24.126,23.865,,30391,6,104,22789,5,10090,72,,9,2488,72,3,350,9740,5,816,9739,9633,4908,60,598,4234,2,,,,14,,626,5534,133.00,152.00,298.00,451.00
92 | 2018-01-22 05:10:00,25,26,102.86,20550.58,21074.40,0.00,2,,3,,2,249,166,165,23.962,23.374,52,30332,7,97,22643,5,10191,77,,11,2450,115,7,273,9618,5,847,9916,9485,5562,83,818,4653,3,2,,,3,,1012,6574,133.00,150.00,300.00,451.00
93 | ...
94 |
95 | OSD Tree Path,,,,osd.id,Deep-Scrub: OK,Deep-Scrub: Starts,OSD Boot,OSD Down: No PG stats,OSD Down: Reported Failed,OSD Out,OSD Wrongly,Slow Primary,Slow Primary: commit_sent,Slow Primary: no flag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waiting for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total
96 | default,row1,rack1,osd-node-003,osd.0,37,37,2,,1,1,,1265,8,69,927,,2,,,259,,,1265
97 | default,row1,rack1,osd-node-003,osd.2,26,26,3,,2,,2,2745,24,39,2046,3,6,,,627,,34081,36826
98 | default,row1,rack1,osd-node-003,osd.3,15,15,3,,2,,2,1547,7,188,1065,3,,,,284,,680,2227
99 | default,row1,rack1,osd-node-003,osd.4,28,29,3,,2,1,1,1590,,1,934,,1,,,654,,35,1625
100 | default,row1,rack1,osd-node-003,osd.5,24,27,2,,1,,1,3038,14,305,1982,,1,,,736,,2,3040
101 | default,row1,rack1,osd-node-003,osd.6,31,31,2,,1,,1,2281,24,505,1063,,,,,689,,,2281
102 | default,row1,rack1,osd-node-003,osd.7,43,43,1,,,,,2744,16,113,1840,5,,,,770,,,2744
103 | default,row1,rack1,osd-node-003,osd.8,33,33,1,,,,,1390,3,26,954,,9,,,398,,,1390
104 | default,row1,rack1,osd-node-003,osd.9,34,34,1,,,,,2883,4,220,1548,,3,,,1108,,,2883
105 | default,row1,rack1,osd-node-004,osd.10,8,8,2,1,,1,,1994,13,1,925,1,1,1,,1052,,,1994
106 | default,row1,rack1,osd-node-004,osd.11,9,9,2,1,,1,,1133,1,,956,,,1,,175,,,1133
107 | ...
108 | default,row1,rack2,osd-node-029,,915,915,31,0,21,4,17,12717,341,2732,7155,32,38,1,0,2418,0,0,12717
109 | default,row1,rack2,osd-node-028,,496,497,26,0,16,2,14,17615,124,2223,12062,14,30,0,0,3162,0,0,17615
110 | default,row1,rack2,osd-node-027,,154,154,20,0,11,3,7,13095,224,1753,7253,12,54,14,0,3785,0,0,13095
111 | default,row1,rack2,osd-node-026,,445,445,22,0,12,3,9,15869,578,3750,7262,19,43,35,0,4182,0,0,15869
112 | default,row1,rack2,osd-node-025,,720,720,18,0,10,3,7,16185,123,1691,9394,14,30,3,0,4930,0,0,16185
113 | default,row1,rack2,osd-node-024,,882,882,24,0,13,4,10,21237,384,3710,8365,25,62,47,0,8643,1,0,21237
114 | default,row1,rack2,osd-node-023,,564,564,19,0,10,1,9,16237,38,1062,11968,1,30,5,0,3133,0,0,16237
115 | default,row1,rack2,osd-node-022,,521,521,18,0,9,1,8,21534,66,1261,14698,11,40,4,0,5454,0,0,21534
116 | ...
117 |
118 | Pool ID,Deep-Scrub: Average,Deep-Scrub: Count,Deep-Scrub: Max,Deep-Scrub: Min,Deep-Scrub: Total,Slow Primary,Slow Primary: commit_sent,Slow Primary: no flag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waiting for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total
119 | 1,289.468405,40,507.328,242.775,11578.7,,,,,,,,,,,,
120 | 2,0.137212,3,0.216364,0.035713,0.411637,,,,,,,,,,,,
121 | 3,0.036633,3,0.059088,0.0110841,0.1099,,,,,,,,,,,,
122 | 4,0.080645,3,0.115252,0.0404911,0.241935,221,,2,211,1,7,,,,,,221
123 | 5,0.159889,2,0.2046,0.115178,0.319778,,,,,,,,,,,,
124 | 6,0.065334,4,0.109835,0.029923,0.261337,,,,,,,,,,,,
125 | 7,0.029934,6,0.061202,0.00389504,0.179606,,,,,,,,,,,,
126 | 8,0.054197,3,0.0712051,0.02034,0.162592,,,,,,,,,,,,
127 | 9,209.420344,5061,1365.56,11.3129,1.05988e+06,455401,4806,50870,274801,441,987,246,2,123226,22,69519,524920
128 | 11,16.549503,2698,100.63,0.613281,44650.6,2,2,,,,,,,,,16,18
129 | 12,0.119071,2694,27.8345,0.00160193,320.777,3,,3,,,,,,,,,3
130 | 13,0.130721,3,0.267788,0.0144391,0.392163,,,,,,,,,,,,
131 | ```
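
Because the time histogram, per-OSD, and per-pool sections land in one CSV separated by blank lines (as in the output above), it can be handy to split them into separate files before graphing. A minimal sketch using awk's paragraph mode (the `ceph-log-parsed.section*.csv` file names are just an example):

```
# blank-line-separated sections become separate records in paragraph mode (RS=""),
# so section 1 = time histogram, 2 = OSD chart, 3 = pool chart
awk -v RS= '{ print > ("ceph-log-parsed.section" NR ".csv") }' ceph-log-parsed.csv
```
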
132 | ###### Example screenshots from Spreadsheet view:
133 | ###### Time histogram ( 10 minute interval )
134 | 
135 |
136 | ###### OSD Chart with OSD Tree input
137 | 
138 |
139 | ###### Pool chart showing scrub and slow request counters
140 | 
141 |
142 | #### deep-scrub_timing.awk
143 | - Given a `ceph.log`, this script outputs the elapsed time between the start and completion of every deep-scrub. The output format is CSV: the first column is the deep-scrub duration in seconds, and the second column is the 'deep-scrub' line which stopped the timer. The start/stop lines are keyed on the pg.id. At the end of processing, a Min,Avg,Max summary is also printed, along with the completed 'deep-scrub' lines for the Min and Max processing times.
144 |
145 | ###### Example:
146 | ```
147 | # ./deep-scrub_timing.awk /var/log/ceph/ceph.log > ~/deep-scrub_timings.csv
148 | # cat ~/deep-scrub_timings.csv
149 |
150 | 0.0155821,2018-01-16 03:44:06.068707 osd.764 10.129.152.42:6851/3796002 4467 : cluster [INF] 29.243 deep-scrub ok
151 | 0.0110428,2018-01-16 03:44:11.223353 osd.447 10.129.152.33:6851/3784262 4900 : cluster [INF] 29.5ad deep-scrub ok
152 | 0.0009799,2018-01-16 03:45:59.345522 osd.927 10.129.152.50:6836/2106288 6823 : cluster [INF] 20.e9 deep-scrub ok
153 | 0.002249,2018-01-16 03:46:04.488109 osd.284 10.129.152.30:6848/3526172 4303 : cluster [INF] 18.2f deep-scrub ok
154 | 0.000980854,2018-01-16 03:47:26.628785 osd.540 10.129.152.40:6824/4041304 5864 : cluster [INF] 23.238 deep-scrub ok
155 | 0.00139022,2018-01-16 03:47:27.402259 osd.684 10.129.152.42:6818/3777592 5148 : cluster [INF] 17.26d deep-scrub ok
156 | ...
157 | Min,Avg,Max
158 | 0.000564098,248.451,846.795
159 | Min Req: 2018-01-16 11:28:00.908817 osd.4 10.129.152.25:6837/3496196 5784 : cluster [INF] 48.32 deep-scrub ok
160 | Max Req: 2018-01-17 01:13:12.793967 osd.131 10.129.152.23:6814/3605203 3452 : cluster [INF] 30.7f7 deep-scrub ok
161 | ```
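
The pairing logic is simple enough to sketch as a one-liner: the 'starts' and 'ok' lines for a PG share the pg.id in field 9, so keying on that field gives the start/stop pair. This sketch only prints the two timestamps; the real script converts them to elapsed seconds and tracks Min/Avg/Max:

```
# print "pg.id,start time,end time" for every completed deep-scrub
awk '/ deep-scrub / && $NF=="starts" { start[$9]=$2 }
     / deep-scrub / && $NF=="ok" && ($9 in start) { print $9","start[$9]","$2 }' ceph.log
```
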
162 |
163 | #### iops_histo.sh
164 | - Given a `ceph.log`, this script outputs CSV data that can be graphed to understand the IOPS histogram for the time covered by the log. The left column is IOPS in thousands; the right column is how many 'pgmap' entries fall into that thousand-IOPS bucket.
165 |
166 | ###### Example:
167 | ```
168 | # ./iops_histo.sh ceph.log > iops_histo.csv
169 | # cat iops_histo.csv
170 |
171 | 0,628
172 | 1,124
173 | 2,1986
174 | 3,8339
175 | 4,4218
176 | 5,3705
177 | 6,3233
178 | 7,2574
179 | 8,2013
180 | 9,1453
181 | 10,890
182 | 11,607
183 | 12,413
184 | 13,349
185 | 14,287
186 | 15,238
187 | 16,252
188 | 17,214
189 | 18,173
190 | ```
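
The bucketing idea can be sketched roughly as follows (an approximation, not the script itself; it assumes the pre-Luminous pgmap lines where the IOPS number immediately precedes the 'op/s' token):

```
# count pgmap lines per 1000-IOPS bucket and emit "bucket,count"
grep ' pgmap ' ceph.log \
  | awk '{ for(i=2;i<=NF;i++) if($i ~ /^op\/s/) print int($(i-1)/1000) }' \
  | sort -n | uniq -c \
  | awk '{ print $2","$1 }'
```
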
191 |
192 | #### map_reporters_to_buckets.sh
193 | - Given a ceph-mon.log and the text output of 'ceph osd tree', this script generates a mapping of 'reported failed' counts (both reported and reporter) per OSD, with subtotals rolled up per crush bucket.
194 |
195 | ```
196 | # ceph osd tree > ceph_osd_tree.txt
197 | # ./map_reporters_to_buckets.sh ceph-mon.log ceph_osd_tree.txt > reporters.csv
198 | Searching..., mapping to buckets
199 |
200 | # cat reporters.csv
201 | buckets...,reported,reporter
202 | default,rack1,ceph-storage-003,osd.0,2411,1520
203 | default,rack1,ceph-storage-003,osd.6,1880,2198
204 | default,rack1,ceph-storage-003,osd.10,2456,1663
205 | default,rack1,ceph-storage-003,osd.15,1978,2677
206 | ...
207 | default,rack1,ceph-storage-003,24256,22256,
208 | default,rack1,ceph-storage-004,osd.423,3869,1893
209 | default,rack1,ceph-storage-004,osd.425,3024,2832
210 | default,rack1,ceph-storage-004,osd.427,2219,2439
211 | ...
212 | default,rack1,ceph-storage-004,27784,21096,
213 | ...
214 | default,rack1,206045,167742,
215 | ...
216 | default,rack2,199356,137798,
217 | ...
218 | default,rack3,ceph-storage-046,osd.254,34761,46650
219 | default,rack3,ceph-storage-046,osd.259,32485,38331
220 | default,rack3,ceph-storage-046,osd.264,33657,48924
221 | default,rack3,ceph-storage-046,osd.269,31560,48421
222 | default,rack3,ceph-storage-046,309241,409805,
223 | ...
224 | default,rack3,313686,413547,
225 | default,719087,719087,
226 |
227 | ```
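
For reference, the counting is driven by the monitor's failure-report lines. A minimal sketch of the per-OSD tally, assuming the pre-Luminous phrasing 'osd.X <addr> reported failed by osd.Y <addr>' and without the bucket roll-ups the script adds from the osd tree:

```
# emit "osd,times reported failed,times acting as reporter"
awk 'match($0, /osd\.[0-9]+ [^ ]+ reported failed by osd\.[0-9]+/) {
       n = split(substr($0, RSTART, RLENGTH), p, " ")
       reported[p[1]]++; reporter[p[n]]++
     }
     END {
       for (o in reporter) if (!(o in reported)) reported[o] = 0
       for (o in reported) printf "%s,%d,%d\n", o, reported[o], reporter[o] + 0
     }' ceph-mon.log
```
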
228 |
229 | #### rgw_req_timing.sh
230 | - Given the `radosgw.log`, this script outputs the time between the start and completion of every RGW request. The output format is CSV: the first column is the request duration in seconds, and the second column is the 'req done' line which stopped the timer. The start/stop lines are keyed on the request ID assigned by RGW. At the end of processing, a Min,Avg,Max summary is also printed, along with the 'req done' lines for the Min and Max request times.
231 |
232 | ###### Example:
233 | ```
234 | # ./rgw_req_timing.sh /var/log/ceph/ceph-rgw-myhostname.log > ~/req_timings.csv
235 | # cat ~/req_timings.csv
236 |
237 | 0.187219,2018-01-16 03:47:01.622215 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ======
238 | 0.051897,2018-01-16 03:47:01.989993 2af8a132d700 1 ====== req done req=0x2af8a1327710 op status=0 http_status=200 ======
239 | 0.181928,2018-01-16 03:47:02.045216 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ======
240 | 0.052496,2018-01-16 03:47:02.047359 2af8a5335700 1 ====== req done req=0x2af8a532f710 op status=0 http_status=200 ======
241 | 0.279186,2018-01-16 03:47:02.207797 2af87e7e5700 1 ====== req done req=0x2af87e7df710 op status=0 http_status=200 ======
242 | 0.16574,2018-01-16 03:47:02.447974 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ======
243 | 0.29716,2018-01-16 03:47:02.712994 2af87e7e5700 1 ====== req done req=0x2af87e7df710 op status=0 http_status=200 ======
244 | 0.186362,2018-01-16 03:47:02.828799 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ======
245 | 0.236106,2018-01-16 03:47:02.931637 2af88ab00700 1 ====== req done req=0x2af88aafa710 op status=0 http_status=200 ======
246 | 0.0516322,2018-01-16 03:47:02.952181 2af87f0e7700 1 ====== req done req=0x2af87f0e1710 op status=0 http_status=200 ======
247 | ...
248 | Min,Avg,Max
249 | 0.000127792,0.73737,1200.11
250 | Min Req: 2018-01-16 15:46:07.383273 2af89230f700 1 ====== req done req=0x2af892309710 op status=0 http_status=400 ======
251 | Max Req: 2018-01-16 12:09:07.163211 2af89130d700 1 ====== req done req=0x2af891307710 op status=0 http_status=200 ======
252 | ```
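
The same start/stop pairing idea applies here, keyed on the `req=0x...` token (the token is a pointer that RGW reuses per thread, so each 'req done' pairs with the most recent start seen for that token). A minimal sketch that just prints the paired timestamps, assuming the usual '====== starting new request req=0x... ======' lines are present at the configured log level:

```
# print "req id,start timestamp,done timestamp" for each completed request
awk '/====== starting new request/ { for (i=1; i<=NF; i++) if ($i ~ /^req=/) start[$i] = $1 " " $2 }
     /====== req done/ { for (i=1; i<=NF; i++) if ($i ~ /^req=/ && ($i in start)) print $i "," start[$i] "," $1 " " $2 }' \
  radosgw.log
```
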
253 |
--------------------------------------------------------------------------------
/ceph_log_parser.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 |
3 | #######################################################
4 | #######################################################
5 | ##
6 | ## Run with ceph.log and redirect output to a CSV
7 | ##
8 | ## ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv
9 | ## ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv
10 | ##
11 | ##
12 | ## Available options:
13 | ## -v osdtree=ceph_osd_tree.txt
14 | ## If provided, the osd output portion will be output with its branch path in the crushmap
15 | ##
16 | ## -v timeinterval=(1|10|60|day)
17 | ## If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes)
18 | ##
19 | ## -v bucketsummary=1
20 | ## If provided, provides an output below the OSD data summarizing the OSD counts for each
21 | ## successive bucket branch above the OSD ( example: host, rack, row, root )
22 | ## Default is 1 if 'osdtree' is defined.
23 | ##
24 | ## -v osdhisto=1
25 | ## Provides a column per OSD in the time histogram showing initial 'slow request' entries
26 | ## incurred by that OSD during the time interval.
27 | ## Default is disabled because this can make VERY wide spreadsheets
28 | ##
29 | ## NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be
30 | ## ignored
31 | ##
32 | ## * For items which are average, these are summed and averaged over the measurement interval
33 | ## The measurement is reported at the beginning of the interval measurement period
34 | ## e.g IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line
35 | ##
36 | ## * For items which are a static snapshot, these are reported based on the last line containing those
37 | ## details in the log before the end of the measurement interval
38 | ## e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for
39 | ## the 03:30 line
40 | ##
41 | ## * For items like the Slow requests, the count of those entries is summed during the measured interval and reported
42 | ## e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported
43 | ## If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma
44 | ## separated numbers), are all counted in the 'Slow SubOp' line. For 3x replication, and 50 lines, the reported
45 | ## number would be 100 (due to 2x non-primary copies * 50 lines)
46 | ##
47 | ##
48 | #######################################################
49 | #######################################################
50 |
51 |
52 |
53 | function toMB(mynum,myunit) {
54 | myunit=tolower(myunit)
55 | if (myunit ~ /^b/) { mynum/=(1024*1024); }
56 | else if (myunit ~ /^kb/) { mynum/=1024; }
57 | else if (myunit ~ /^gb/) { mynum*=1024; }
58 | else if (myunit ~ /^tb/) { mynum*=1024*1024; }
59 | return sprintf("%0.2f",mynum)
60 | }
61 |
62 | function toTB(mynum,myunit) {
63 | myunit=tolower(myunit)
64 | if (myunit ~ /^b/) { mynum/=(1024*1024*1024*1024) }
65 | else if (myunit ~ /^kb/) { mynum/=(1024*1024*1024) }
66 | else if (myunit ~ /^mb/) { mynum/=(1024*1024) }
67 | else if (myunit ~ /^gb/) { mynum/=1024 }
68 | else if (myunit ~ /^pb/) { mynum*=1024 }
69 | else if (myunit ~ /^eb/) { mynum*=1024*1024 }
70 | return sprintf("%0.2f",mynum)
71 | }
72 |
73 | function join(array,sep) {
74 | if(1 in array) {
75 | result=array[1]
76 | arraylen=length(array)
77 | if(arraylen>1) {
78 | for(z=2;z<=arraylen;z++)
79 | result = result sep array[z]
80 | }
81 | }
82 | return result
83 | }
84 |
85 | function procbranch(myline) {
86 | split(myline,lineparts," ")
87 | if(lineparts[3] in branchtype) {
88 | if(currentdepth>branchtype[lineparts[3]]) {
89 | for(i=currentdepth;i>branchtype[lineparts[3]];i--) {
90 | delete prefix[i]
91 | delete branchtype[i]
92 | }
93 | delete prefix[branchtype[lineparts[3]]]
94 | }
95 | } else {
96 | currentdepth++
97 | branchtype[lineparts[3]]=currentdepth
98 | }
99 | prefix[branchtype[lineparts[3]]]=lineparts[4]
100 | wasinhost=0
101 | }
102 |
103 | function procosd(myline) {
104 | split(myline,lineparts," ")
105 | outline=join(prefix,",")
106 | if(classenable==1)
107 | outline=outline","lineparts[2]
108 | osdpaths[lineparts[osdoffset]]=outline
109 | outline=outline","lineparts[osdoffset]
110 | osdpathsbypath[outline]=lineparts[osdoffset]
111 | if(currentdepth>maxpathdepth)
112 | maxpathdepth=currentdepth
113 | }
114 |
115 | function histoevent(mykey,myevent,myfunc,myvalue) {
116 | EVENTHEADERS[myevent]=1
117 | if(myfunc=="sum")
118 | EVENTCOUNT[mykey][myevent]+=myvalue
119 | else if(myfunc=="set")
120 | EVENTCOUNT[mykey][myevent]=myvalue
121 | else if(myfunc=="inc")
122 | EVENTCOUNT[mykey][myevent]++
123 | }
124 |
125 | function histototal(myevent,myvalue) {
126 | EVENTTOTAL[myevent]+=myvalue
127 | }
128 |
129 | function osdhistoevent(mykey,myevent,myfunc,myvalue) {
130 | if(osdhisto!="") {
131 | OSDEVENTHEADERS[myevent]=1
132 | if(myfunc=="sum")
133 | OSDEVENTCOUNT[mykey][myevent]+=myvalue
134 | else if(myfunc=="set")
135 | OSDEVENTCOUNT[mykey][myevent]=myvalue
136 | else if(myfunc=="inc")
137 | OSDEVENTCOUNT[mykey][myevent]++
138 | }
139 | }
140 |
141 | function osdhistototal(myevent,myvalue) {
142 | if(osdhisto!="")
143 | OSDEVENTTOTAL[myevent]+=myvalue
144 | }
145 |
146 | function osdevent(mykey,myevent,myfunc,myvalue) {
147 | OSDHEADERS[myevent]=1
148 | if(myfunc=="sum")
149 | OSDEVENT[mykey][myevent]+=myvalue
150 | else if(myfunc=="set")
151 | OSDEVENT[mykey][myevent]=myvalue
152 | else if(myfunc=="inc")
153 | OSDEVENT[mykey][myevent]++
154 | }
155 |
156 | function osdtotal(myevent,myvalue) {
157 | OSDTOTAL[myevent]+=myvalue
158 | }
159 |
160 | function poolevent(mykey,myevent,myfunc,myvalue) {
161 | POOLHEADERS[myevent]=1
162 | if(myfunc=="sum")
163 | POOLEVENT[mykey][myevent]+=myvalue
164 | else if(myfunc=="set")
165 | POOLEVENT[mykey][myevent]=myvalue
166 | else if(myfunc=="inc")
167 | POOLEVENT[mykey][myevent]++
168 | else if(myfunc=="max") {
169 | if(myvalue>POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "")
170 | POOLEVENT[pgparts[1]][myevent]=myvalue
171 | } else if(myfunc=="min") {
172 | if(myvalue<POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "")
173 | POOLEVENT[pgparts[1]][myevent]=myvalue
174 | }
175 | }
176 |
177 | # Align a "YYYY-MM-DD HH:MM:SS.ssssss" log timestamp to the configured
178 | # timeinterval (1, 10 or 60 minutes, or 'day'), returning "YYYY-MM-DD HH:MM"
179 | function mydtstamp(mydt) {
180 | split(mydt,dtparts," ")
181 | split(dtparts[2],timeparts,":")
182 | if(timeinterval=="day")
183 | return dtparts[1]" 00:00"
184 | else if(timeinterval==60)
185 | return dtparts[1]" "timeparts[1]":00"
186 | else
187 | return dtparts[1]" "timeparts[1]":"sprintf("%02d",int(timeparts[2]/timeinterval)*timeinterval)
188 | }
189 |
190 | BEGIN {
191 | # Default the histogram alignment to 10 minutes
192 | if(timeinterval=="")
193 | timeinterval=10
194 | if(osdtree!="") {
195 | # bucketsummary defaults to 1 when an osd tree is supplied
196 | if(bucketsummary=="")
197 | bucketsummary=1
198 | while (( getline line < osdtree ) > 0 ) {
199 | split(line,osdtreeparts," ")
200 | switch (osdtreeparts[1]) {
201 | case "ID":
202 | classenable=0
203 | osdoffset=3
204 | if(osdtreeparts[2]=="CLASS") {
205 | classenable=1
206 | osdoffset=4
207 | }
208 | break
209 | case /^ *-/:
210 | procbranch(line)
211 | break
212 | case /^ *[0-9]/:
213 | wasinhost=1
214 | procosd(line)
215 | break
216 | }
217 | }
218 | }
219 | }
220 |
221 | / HEALTH_/ {
222 | MYDTSTAMP=mydtstamp($1" "$2)
223 | myline=$0
224 | gsub(";","",$9)
225 | histoevent(MYDTSTAMP,$9,"inc")
226 | myeventadd=0
227 | split(myline,mylineparts,"; ")
228 |
229 | for(linepartindex in mylineparts) {
230 | if(mylineparts[linepartindex] ~ /flag\(s\) set/) {
231 | split(mylineparts[linepartindex],linespaced," ")
232 | split(linespaced[1],flags,",")
233 | for(flagidx in flags) {
234 | histoevent(MYDTSTAMP,"Flag: "flags[flagidx],"inc")
235 | }
236 | }
237 | }
238 |
239 | }
240 |
241 | / pgmap / {
242 | MYDTSTAMP=mydtstamp($1" "$2)
243 | myline=$0
244 | myeventadd=0
245 | split(myline,mylineparts,";")
246 |
247 | for(linepartindex in mylineparts) {
248 | switch (mylineparts[linepartindex]) {
249 | case / pgs: /:
250 | split(mylineparts[linepartindex],junka,":")
251 | split(junka[7],pgstats,",")
252 |
253 | # Reset the counts so that only the last line in a measured interval is accumulated
254 | if(MYDTSTAMP in EVENTCOUNT) {
255 | for(key in EVENTCOUNT[MYDTSTAMP])
256 | if(key ~ /^PG: /)
257 | delete EVENTCOUNT[MYDTSTAMP][key]
258 | }
259 |
260 | for(pgstatindex in pgstats) {
261 | pgstat=pgstats[pgstatindex]
262 | split(pgstat,statparts," ")
263 | split(statparts[2],pgstate,"+")
264 | for(pgstateindex in pgstate) {
265 | myeventname="PG: "pgstate[pgstateindex]
266 | histoevent(MYDTSTAMP,myeventname,"sum",statparts[1])
267 | }
268 | }
269 | break
270 | case / avail$/:
271 | split(mylineparts[linepartindex],clusterspace,",")
272 | for(spaceindex in clusterspace) {
273 | split(clusterspace[spaceindex],myspaceparts," ")
274 | if(myspaceparts[3] ~ /^data/) {
275 | histoevent(MYDTSTAMP,"Space (TB): Data Stored","set",toTB(myspaceparts[1],myspaceparts[2]))
276 | } else if(myspaceparts[3] ~ /^used/) {
277 | histoevent(MYDTSTAMP,"Space (TB): Raw Used","set",toTB(myspaceparts[1],myspaceparts[2]))
278 | } else if(6 in myspaceparts) {
279 | histoevent(MYDTSTAMP,"Space (TB): Free","set",toTB(myspaceparts[1],myspaceparts[2]))
280 | histoevent(MYDTSTAMP,"Space (TB): Total","set",toTB(myspaceparts[4],myspaceparts[5]))
281 | }
282 | }
283 | break
284 | case /op\/s/:
285 | split(mylineparts[linepartindex],clilineparts,",")
286 | for(clilpindex in clilineparts) {
287 | split(clilineparts[clilpindex],mycliparts," ")
288 | if(3 in mycliparts) {
289 | myeventadd=toMB(mycliparts[1],mycliparts[2])
290 | if(mycliparts[3] ~ /^rd/) {
291 | myeventname="IO: Client Avg Read MB/s"
292 | myeventcount="Client Read Count"
293 | }
294 | else if(mycliparts[3] ~ /^wr/) {
295 | myeventname="IO: Client Avg Write MB/s"
296 | myeventcount="Client Write Count"
297 | }
298 | } else {
299 | myeventname="IO: Client Avg IOPs"
300 | myeventadd=mycliparts[1]
301 | myeventcount="Client IOPsCount"
302 | }
303 | histoevent(MYDTSTAMP,myeventname,"set",sprintf("%0.2f",((EVENTCOUNT[MYDTSTAMP][myeventname]*EVENTCOUNT[MYDTSTAMP][myeventcount])+myeventadd)/(EVENTCOUNT[MYDTSTAMP][myeventcount]+1)))
304 | EVENTCOUNT[MYDTSTAMP][myeventcount]++
305 | }
306 | break
307 | case / objects degraded /:
308 | split(mylineparts[linepartindex],degradeobj," ")
309 | gsub(/[^0-9\.]/,"",degradeobj[4])
310 | histoevent(MYDTSTAMP,"Objects: Degraded Percent","set",degradeobj[4])
311 | break
312 | case / objects misplaced /:
313 | split(mylineparts[linepartindex],degradeobj," ")
314 | gsub(/[^0-9\.]/,"",degradeobj[4])
315 | histoevent(MYDTSTAMP,"Objects: Misplaced Percent","set",degradeobj[4])
316 | break
317 | case / recovering$/:
318 | myeventname="IO: Recovery Avg MB/s"
319 | myeventcount="RecoveryCount"
320 | split(mylineparts[linepartindex],reclineparts," ")
321 | myeventadd=toMB(reclineparts[1],reclineparts[2])
322 | histoevent(MYDTSTAMP,myeventname,"set",sprintf("%0.2f",((EVENTCOUNT[MYDTSTAMP][myeventname]*EVENTCOUNT[MYDTSTAMP][myeventcount])+myeventadd)/(EVENTCOUNT[MYDTSTAMP][myeventcount]+1)))
323 | EVENTCOUNT[MYDTSTAMP][myeventcount]++
324 | break
325 | }
326 | }
327 | }
328 |
329 | / deep-scrub / {
330 | MYDTSTAMP=mydtstamp($1" "$2)
331 | MYPG=$9
332 | MYDATE=$1
333 | MYTIME=$2
334 | gsub(/[-:]/," ",MYDATE)
335 | gsub(/[-:]/," ",MYTIME)
336 | MYTIME=mktime(MYDATE" "MYTIME)
337 | split($2,secs,".")
338 | millisecs=sprintf("0.%s",secs[2])
339 | MYTIME+=millisecs
340 |
341 | if($NF == "starts") {
342 | MYEVENT="Deep-Scrub: Starts"
343 | histoevent(MYDTSTAMP,MYEVENT,"inc")
344 | osdevent($3,MYEVENT,"inc")
345 | osdtotal(MYEVENT,1)
346 | histototal(MYEVENT,1)
347 | MYSTART[MYPG]=MYTIME
348 | }
349 | else {
350 | if(MYSTART[MYPG]!="") {
351 | mydiff=MYTIME-MYSTART[MYPG]
352 | split(MYPG,pgparts,".")
353 | poolevent(pgparts[1],"Deep-Scrub: Count","inc")
354 | poolevent(pgparts[1],"Deep-Scrub: Total","sum",mydiff)
355 | poolevent(pgparts[1],"Deep-Scrub: Min","min",mydiff)
356 | poolevent(pgparts[1],"Deep-Scrub: Max","max",mydiff)
357 | }
358 | if($NF == "ok") {
359 | MYEVENT="Deep-Scrub: OK"
360 | histoevent(MYDTSTAMP,MYEVENT,"inc")
361 | histototal(MYEVENT,1)
362 | osdevent($3,MYEVENT,"inc")
363 | osdtotal(MYEVENT,1)
364 | } else {
365 | MYEVENT="Deep-Scrub: Not OK"
366 | histoevent(MYDTSTAMP,MYEVENT,"inc")
367 | histototal(MYEVENT,1)
368 | osdevent($3,MYEVENT,"inc")
369 | osdtotal(MYEVENT,1)
370 | }
371 | }
372 | }
373 |
374 | /slow request / {
375 | MYDTSTAMP=mydtstamp($1" "$2)
376 | MYLINE=$0
377 | split(MYLINE,myparts,":")
378 | split(myparts[9],opparts," ")
379 | if (opparts[2] ~ /^[0-9]*\.[0-9a-f]*$/)
380 | split(opparts[2],pgid,".")
381 | else if (opparts[9] ~ /^[0-9]*\.[0-9a-f]*$/)
382 | split(opparts[9],pgid,".")
383 |
384 | if ($0 ~ /subops from/) {
385 | split($NF,subosds,",")
386 | for (subosdidx in subosds) {
387 | subosd="osd."subosds[subosdidx]
388 | if($11 < 60) {
389 | myeventstring="Slow SubOp,Slow Total"
390 | osdhistoevent(MYDTSTAMP,subosd,"inc")
391 | osdhistototal(subosd,1)
392 | } else {
393 | myeventstring="Slow Relog SubOp,Slow Relog Total"
394 | }
395 | split(myeventstring,myevents,",")
396 | for(myevent in myevents) {
397 | histototal(myevents[myevent],1)
398 | histoevent(MYDTSTAMP,myevents[myevent],"inc")
399 | osdevent(subosd,myevents[myevent],"inc")
400 | osdtotal(myevents[myevent],1)
401 | poolevent(pgid[1],myevents[myevent],"inc")
402 | }
403 | }
404 | } else {
405 | MYTYPE=$0
406 | mytpartcount=split($0,mytparts," currently ")
407 | MYTYPE="Slow Primary: "mytparts[mytpartcount]
408 | if($11 < 60) {
409 | myeventstring="Slow Primary,Slow Total,"MYTYPE
410 | osdhistoevent(MYDTSTAMP,$3,"inc")
411 | osdhistototal($3,1)
412 | } else {
413 | myeventstring="Slow Relog Primary,Slow Relog Total"
414 | }
415 | split(myeventstring,myevents,",")
416 | for(myevent in myevents) {
417 | histoevent(MYDTSTAMP,myevents[myevent],"inc")
418 | histototal(myevents[myevent],1)
419 | osdevent($3,myevents[myevent],"inc")
420 | osdtotal(myevents[myevent],1)
421 | poolevent(pgid[1],myevents[myevent],"inc")
422 | }
423 | }
424 | }
425 |
426 | / osdmap / {
427 | MYDTSTAMP=mydtstamp($1" "$2)
428 | histoevent(MYDTSTAMP,"OSDs","set",$11)
429 | histoevent(MYDTSTAMP,"OSDs UP","set",$13)
430 | histoevent(MYDTSTAMP,"OSDs IN","set",$15)
431 | }
432 |
433 | / osd\.[0-9]* out / {
434 | MYDTSTAMP=mydtstamp($1" "$2)
435 | MYEVENT="OSD Out"
436 | histoevent(MYDTSTAMP,MYEVENT,"inc")
437 | histototal(MYEVENT,1)
438 | osdevent($9,MYEVENT,"inc")
439 | osdtotal(MYEVENT,1)
440 | }
441 |
442 | / wrongly marked me down$/ {
443 | MYDTSTAMP=mydtstamp($1" "$2)
444 | MYEVENT="OSD Wrongly"
445 | histoevent(MYDTSTAMP,MYEVENT,"inc")
446 | histototal(MYEVENT,1)
447 | osdevent($3,MYEVENT,"inc")
448 | osdtotal(MYEVENT,1)
449 | }
450 |
451 | / marked itself down/ {
452 | MYDTSTAMP=mydtstamp($1" "$2)
453 | MYEVENT="OSD Down: Self"
454 | histoevent(MYDTSTAMP,MYEVENT,"inc")
455 | histototal(MYEVENT,1)
456 | osdevent($9,MYEVENT,"inc")
457 | osdtotal(MYEVENT,1)
458 | }
459 |
460 | / failed .*reports from / {
461 | MYDTSTAMP=mydtstamp($1" "$2)
462 | MYEVENT="OSD Down: Reported Failed"
463 | histoevent(MYDTSTAMP,MYEVENT,"inc")
464 | histototal(MYEVENT,1)
465 | osdevent($9,MYEVENT,"inc")
466 | osdtotal(MYEVENT,1)
467 | }
468 |
469 | / marked down after no pg stats for / {
470 | MYDTSTAMP=mydtstamp($1" "$2)
471 | MYEVENT="OSD Down: No PG stats"
472 | histoevent(MYDTSTAMP,MYEVENT,"inc")
473 | histototal(MYEVENT,1)
474 | osdevent($9,MYEVENT,"inc")
475 | osdtotal(MYEVENT,1)
476 | }
477 |
478 | / boot$/ {
479 | MYDTSTAMP=mydtstamp($1" "$2)
480 | MYEVENT="OSD Boot"
481 | histoevent(MYDTSTAMP,MYEVENT,"inc")
482 | histototal(MYEVENT,1)
483 | osdevent($9,MYEVENT,"inc")
484 | osdtotal(MYEVENT,1)
485 | }
486 |
487 | END {
488 |
489 | ## Begin outputting the histogram chart
490 | printf("DateTime")
491 | n=asorti(EVENTHEADERS)
492 | if(osdhisto!="")
493 | osdn=asorti(OSDEVENTHEADERS)
494 | for (i = 1; i<= n; i++ )
495 | printf(",%s",EVENTHEADERS[i])
496 | if(osdhisto!="") {
497 | for (i = 1; i<= osdn; i++)
498 | printf(",%s",OSDEVENTHEADERS[i])
499 | }
500 |
501 | printf("\n")
502 |
503 | dtcount=asorti(EVENTCOUNT,DTS)
504 |
505 | for (dtindex =1; dtindex <= dtcount; dtindex++) {
506 | DT=DTS[dtindex]
507 | printf("%s:00", DT)
508 | for (i = 1; i<= n; i++ )
509 | printf(",%s",EVENTCOUNT[DT][EVENTHEADERS[i]])
510 | if(osdhisto!="") {
511 | # add-on the per OSD histo columns
512 | for (i = 1; i<= osdn; i++ )
513 | printf(",%s",OSDEVENTCOUNT[DT][OSDEVENTHEADERS[i]])
514 | }
515 | printf("\n")
516 | }
517 |
518 | ## Begin outputting the column totals line
519 | printf("Totals")
520 | for (i = 1; i<= n; i++ )
521 | printf(",%s",EVENTTOTAL[EVENTHEADERS[i]])
522 | if(osdhisto!="") {
523 | for (i = 1; i<= osdn; i++ )
524 | printf(",%s",OSDEVENTTOTAL[OSDEVENTHEADERS[i]])
525 | }
526 |
527 | printf("\n")
528 | printf("\n")
529 |
530 | ## Begin outputting the OSD chart
531 | o=asorti(OSDHEADERS,OHDR)
532 |
533 | if(osdtree != "") {
534 | printf("OSD Tree Path,")
535 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++)
536 | printf(",")
537 | }
538 |
539 | printf("osd.id")
540 | for (i = 1; i<= o; i++ ) {
541 | printf(",%s",OHDR[i])
542 | }
543 | printf("\n")
544 |
545 | if(osdtree=="") {
546 | for (OSD in OSDEVENT) {
547 | gsub(/^osd\./,"",OSD)
548 | OSDS[OSD]=OSD
549 | }
550 | osdcount=asort(OSDS)
551 | } else {
552 | osdcount=asorti(osdpathsbypath,OSDS)
553 | }
554 | for (osdindex=1; osdindex<=osdcount; osdindex++) {
555 | if(osdtree=="")
556 | osd="osd."OSDS[osdindex]
557 | else {
558 | osd=OSDS[osdindex]
559 | split(OSDS[osdindex],osdparts,",")
560 | osd=osdparts[length(osdparts)]
561 |
562 | printf("%s,",osdpaths[osd])
563 | split(osdpaths[osd],pathjunk,",")
564 | pathdepth=length(pathjunk)
565 | if(pathdepth=1;bindex--) {
608 | printf("%s,",BKS[bindex])
609 | split(BKS[bindex],bucketjunk,",")
610 | junklen=length(bucketjunk)
611 | for(i=junklen; i< maxpathdepth; i++)
612 | printf(",")
613 | for (i = 1; i<= o; i++ ) {
614 | if(BUCKETSUMMARY[BKS[bindex]][OHDR[i]]>0)
615 | printf(",%s",BUCKETSUMMARY[BKS[bindex]][OHDR[i]])
616 | else
617 | printf(",")
618 | }
619 | printf("\n")
620 | }
621 | } else {
622 | ## Or print column totals if Bucket Summary is not selected
623 | printf("Totals")
624 | if(osdtree != "") {
625 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++)
626 | printf(",")
627 | }
628 | for (i = 1; i<= o; i++ ) {
629 | printf(",%s",OSDTOTAL[OHDR[i]])
630 | }
631 | }
632 |
633 | printf("\n\n")
634 |
635 | ## Begin outputting the Pool summary chart
636 | POOLHEADERS["Deep-Scrub: Average"]=1
637 | poolcount=asorti(POOLEVENT,poolids)
638 | phdrcount=asorti(POOLHEADERS,PHDR)
639 | printf("Pool ID")
640 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++)
641 | printf(",%s",PHDR[phdrindex])
642 | printf("\n")
643 | for(pindex=1;pindex<=poolcount;pindex++) {
644 | printf("%s",poolids[pindex])
645 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) {
646 | if(PHDR[phdrindex]=="Deep-Scrub: Average") {
647 | if(POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"])
648 | printf(",%0.6f",POOLEVENT[poolids[pindex]]["Deep-Scrub: Total"]/POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"])
649 | else
650 | printf(",")
651 | } else
652 | printf(",%s",POOLEVENT[poolids[pindex]][PHDR[phdrindex]])
653 | }
654 | printf("\n")
655 | }
656 | }
657 |
658 |
659 |
--------------------------------------------------------------------------------
/ceph_log_parser.luminous.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 |
3 | #######################################################
4 | #######################################################
5 | ##
6 | ## Run with ceph.log and redirect output to a CSV
7 | ##
8 | ## ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv
9 | ## ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv
10 | ##
11 | ##
12 | ## Available options:
13 | ## -v osdtree=ceph_osd_tree.txt
14 | ## If provided, the osd output portion will be output with its branch path in the crushmap
15 | ##
16 | ## -v timeinterval=(1|10|60|day)
17 | ## If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes)
18 | ##
19 | ## -v bucketsummary=1
20 | ## If provided, provides an output below the OSD data summarizing the OSD counts for each
21 | ## successive bucket branch above the OSD ( example: host, rack, row, root )
22 | ## Default is 1 if 'osdtree' is defined.
23 | ##
24 | ## -v osdhisto=1
25 | ## Provides a column per OSD in the time histogram showing initial 'slow request' entries
26 | ## incurred by that OSD during the time interval.
27 | ## Default is disabled because this can make VERY wide spreadsheets
28 | ##
29 | ## NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be
30 | ## ignored
31 | ##
32 | ## * For items which are average, these are summed and averaged over the measurement interval
33 | ## The measurement is reported at the beginning of the interval measurement period
34 | ## e.g IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line
35 | ##
36 | ## * For items which are a static snapshot, these are reported based on the last line containing those
37 | ## details in the log before the end of the measurement interval
38 | ## e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for
39 | ## the 03:30 line
40 | ##
41 | ## * For items like the Slow requests, the count of those entries is summed during the measured interval and reported
42 | ## e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported
43 | ## If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma
44 | ## separated numbers), are all counted in the 'Slow SubOp' line. For 3x replication, and 50 lines, the reported
45 | ## number would be 100 (due to 2x non-primary copies * 50 lines)
46 | ##
47 | ##
48 | #######################################################
49 | #######################################################
50 |
51 |
52 |
53 | function toMB(mynum,myunit) {
54 | myunit=tolower(myunit)
55 | if (myunit ~ /^b/) { mynum/=(1024*1024); }
56 | else if (myunit ~ /^kb/) { mynum/=1024; }
57 | else if (myunit ~ /^gb/) { mynum*=1024; }
58 | else if (myunit ~ /^tb/) { mynum*=1024*1024; }
59 | return sprintf("%0.2f",mynum)
60 | }
61 |
62 | function toTB(mynum,myunit) {
63 | myunit=tolower(myunit)
64 | if (myunit ~ /^b/) { mynum/=(1024*1024*1024*1024) }
65 | else if (myunit ~ /^kb/) { mynum/=(1024*1024*1024) }
66 | else if (myunit ~ /^mb/) { mynum/=(1024*1024) }
67 | else if (myunit ~ /^gb/) { mynum/=1024 }
68 | else if (myunit ~ /^pb/) { mynum*=1024 }
69 | else if (myunit ~ /^eb/) { mynum*=1024*1024 }
70 | return sprintf("%0.2f",mynum)
71 | }
72 |
73 | function join(array,sep) {
74 | if(1 in array) {
75 | result=array[1]
76 | arraylen=length(array)
77 | if(arraylen>1) {
78 | for(z=2;z<=arraylen;z++)
79 | result = result sep array[z]
80 | }
81 | }
82 | return result
83 | }
84 |
85 | function procbranch(myline) {
86 | split(myline,lineparts," ")
87 | if(lineparts[3] in branchtype) {
88 | if(currentdepth>branchtype[lineparts[3]]) {
89 | for(i=currentdepth;i>branchtype[lineparts[3]];i--) {
90 | delete prefix[i]
91 | delete branchtype[i]
92 | }
93 | delete prefix[branchtype[lineparts[3]]]
94 | }
95 | } else {
96 | currentdepth++
97 | branchtype[lineparts[3]]=currentdepth
98 | }
99 | prefix[branchtype[lineparts[3]]]=lineparts[4]
100 | wasinhost=0
101 | }
102 |
103 | function procosd(myline) {
104 | split(myline,lineparts," ")
105 | outline=join(prefix,",")
106 | if(classenable==1)
107 | outline=outline","lineparts[2]
108 | osdpaths[lineparts[osdoffset]]=outline
109 | outline=outline","lineparts[osdoffset]
110 | osdpathsbypath[outline]=lineparts[osdoffset]
111 | if(currentdepth>maxpathdepth)
112 | maxpathdepth=currentdepth
113 | }
114 |
115 | function histoevent(mykey,myevent,myfunc,myvalue) {
116 | EVENTHEADERS[myevent]=1
117 | if(myfunc=="sum")
118 | EVENTCOUNT[mykey][myevent]+=myvalue
119 | else if(myfunc=="set")
120 | EVENTCOUNT[mykey][myevent]=myvalue
121 | else if(myfunc=="inc")
122 | EVENTCOUNT[mykey][myevent]++
123 | }
124 |
125 | function histototal(myevent,myvalue) {
126 | EVENTTOTAL[myevent]+=myvalue
127 | }
128 |
129 | function osdhistoevent(mykey,myevent,myfunc,myvalue) {
130 | if(osdhisto!="") {
131 | OSDEVENTHEADERS[myevent]=1
132 | if(myfunc=="sum")
133 | OSDEVENTCOUNT[mykey][myevent]+=myvalue
134 | else if(myfunc=="set")
135 | OSDEVENTCOUNT[mykey][myevent]=myvalue
136 | else if(myfunc=="inc")
137 | OSDEVENTCOUNT[mykey][myevent]++
138 | }
139 | }
140 |
141 | function osdhistototal(myevent,myvalue) {
142 | if(osdhisto!="")
143 | OSDEVENTTOTAL[myevent]+=myvalue
144 | }
145 |
146 | function osdevent(mykey,myevent,myfunc,myvalue) {
147 | OSDHEADERS[myevent]=1
148 | if(myfunc=="sum")
149 | OSDEVENT[mykey][myevent]+=myvalue
150 | else if(myfunc=="set")
151 | OSDEVENT[mykey][myevent]=myvalue
152 | else if(myfunc=="inc")
153 | OSDEVENT[mykey][myevent]++
154 | }
155 |
156 | function osdtotal(myevent,myvalue) {
157 | OSDTOTAL[myevent]+=myvalue
158 | }
159 |
160 | function poolevent(mykey,myevent,myfunc,myvalue) {
161 | POOLHEADERS[myevent]=1
162 | if(myfunc=="sum")
163 | POOLEVENT[mykey][myevent]+=myvalue
164 | else if(myfunc=="set")
165 | POOLEVENT[mykey][myevent]=myvalue
166 | else if(myfunc=="inc")
167 | POOLEVENT[mykey][myevent]++
168 | else if(myfunc=="max") {
169 | if(myvalue>POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "")
170 | POOLEVENT[pgparts[1]][myevent]=myvalue
171 | } else if(myfunc=="min") {
172 | if(myvalue<POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "")
173 | POOLEVENT[pgparts[1]][myevent]=myvalue
174 | }
175 | }
176 |
177 | # Align a "YYYY-MM-DD HH:MM:SS.ssssss" log timestamp to the configured
178 | # timeinterval (1, 10 or 60 minutes, or 'day'), returning "YYYY-MM-DD HH:MM"
179 | function mydtstamp(mydt) {
180 | split(mydt,dtparts," ")
181 | split(dtparts[2],timeparts,":")
182 | if(timeinterval=="day")
183 | return dtparts[1]" 00:00"
184 | else if(timeinterval==60)
185 | return dtparts[1]" "timeparts[1]":00"
186 | else
187 | return dtparts[1]" "timeparts[1]":"sprintf("%02d",int(timeparts[2]/timeinterval)*timeinterval)
188 | }
189 |
190 | BEGIN {
191 | # Default the histogram alignment to 10 minutes
192 | if(timeinterval=="")
193 | timeinterval=10
194 | if(osdtree!="") {
195 | # bucketsummary defaults to 1 when an osd tree is supplied
196 | if(bucketsummary=="")
197 | bucketsummary=1
198 | while (( getline line < osdtree ) > 0 ) {
199 | split(line,osdtreeparts," ")
200 | switch (osdtreeparts[1]) {
201 | case "ID":
202 | classenable=0
203 | osdoffset=3
204 | if(osdtreeparts[2]=="CLASS") {
205 | classenable=1
206 | osdoffset=4
207 | }
208 | break
209 | case /^ *-/:
210 | procbranch(line)
211 | break
212 | case /^ *[0-9]/:
213 | wasinhost=1
214 | procosd(line)
215 | break
216 | }
217 | }
218 | }
219 | }
220 |
221 | / overall HEALTH/ {
222 | if($NF == "HEALTH_OK")
223 | next
224 | MYDTSTAMP=mydtstamp($1" "$2)
225 | myline=$0
226 | myeventadd=0
227 | split(myline,mlpa," : ")
228 | split(mlpa[2],mylineparts,";")
229 |
230 | for(linepartindex in mylineparts) {
231 | switch (mylineparts[linepartindex]) {
232 | case / osds down$/:
233 | split(mylineparts[linepartindex],osdparts," ")
234 | histoevent(MYDTSTAMP,"OSDs down","set",osdparts[5])
235 | break
236 | case / host.*down$/:
237 | split(mylineparts[linepartindex],hostparts," ")
238 | histoevent(MYDTSTAMP,"HOSTs down","set",hostparts[1])
239 | break
240 | case /Reduced data availability: /:
241 | case /Possible data damage: /:
242 | split(mylineparts[linepartindex],linepartA,":")
243 | split(linepartA[2],linepartB,",")
244 | for(field in linepartB) {
245 | split(linepartB[field],fparts," ")
246 | myevent="PG: "fparts[3]
247 | histoevent(MYDTSTAMP,myevent,"set",fparts[1])
248 | }
249 | break
250 | case /Degraded data redundancy: /:
251 | split(mylineparts[linepartindex],linepartA,":")
252 | split(linepartA[2],linepartB,",")
253 | for(field in linepartB) {
254 | if(linepartB[field] ~ /objects degraded/) {
255 | split(linepartB[field],linepartC," ")
256 | gsub(/[^0-9\.]/,"",linepartC[4])
257 | histoevent(MYDTSTAMP,"Objects: Degraded Percent","set",linepartC[4])
258 | } else {
259 | split(linepartB[field],fparts," ")
260 | myevent="PG: "fparts[3]
261 | histoevent(MYDTSTAMP,myevent,"set",fparts[1])
262 | }
263 | }
264 | break
265 | case / objects misplaced /:
266 | split(mylineparts[linepartindex],degradeobj," ")
267 | gsub(/[^0-9\.]/,"",degradeobj[4])
268 | histoevent(MYDTSTAMP,"Objects: Misplaced Percent","set",degradeobj[4])
269 | break
270 | }
271 | }
272 | }
273 |
274 | / deep-scrub / {
275 | MYDTSTAMP=mydtstamp($1" "$2)
276 | MYPG=$9
277 | MYDATE=$1
278 | MYTIME=$2
279 | gsub(/[-:]/," ",MYDATE)
280 | gsub(/[-:]/," ",MYTIME)
281 | MYTIME=mktime(MYDATE" "MYTIME)
282 | split($2,secs,".")
283 | millisecs=sprintf("0.%s",secs[2])
284 | MYTIME+=millisecs
285 |
286 | if($NF == "starts") {
287 | MYEVENT="Deep-Scrub: Starts"
288 | histoevent(MYDTSTAMP,MYEVENT,"inc")
289 | osdevent($3,MYEVENT,"inc")
290 | osdtotal(MYEVENT,1)
291 | histototal(MYEVENT,1)
292 | MYSTART[MYPG]=MYTIME
293 | }
294 | else {
295 | if(MYSTART[MYPG]!="") {
296 | mydiff=MYTIME-MYSTART[MYPG]
297 | split(MYPG,pgparts,".")
298 | poolevent(pgparts[1],"Deep-Scrub: Count","inc")
299 | poolevent(pgparts[1],"Deep-Scrub: Total","sum",mydiff)
300 | poolevent(pgparts[1],"Deep-Scrub: Min","min",mydiff)
301 | poolevent(pgparts[1],"Deep-Scrub: Max","max",mydiff)
302 | }
303 | if($NF == "ok") {
304 | MYEVENT="Deep-Scrub: OK"
305 | histoevent(MYDTSTAMP,MYEVENT,"inc")
306 | histototal(MYEVENT,1)
307 | osdevent($3,MYEVENT,"inc")
308 | osdtotal(MYEVENT,1)
309 | } else {
310 | MYEVENT="Deep-Scrub: Not OK"
311 | histoevent(MYDTSTAMP,MYEVENT,"inc")
312 | histototal(MYEVENT,1)
313 | osdevent($3,MYEVENT,"inc")
314 | osdtotal(MYEVENT,1)
315 | }
316 | }
317 | }
318 |
319 | /slow request / {
320 | MYDTSTAMP=mydtstamp($1" "$2)
321 | MYLINE=$0
322 | split(MYLINE,myparts,":")
323 | split(myparts[9],opparts," ")
324 | if (opparts[2] ~ /^[0-9]*\.[0-9a-f]*$/)
325 | split(opparts[2],pgid,".")
326 | else if (opparts[9] ~ /^[0-9]*\.[0-9a-f]*/)
327 | split(opparts[9],pgid,".")
328 | else if (myparts[8] ~ /pg_update_log_missing/) {
329 | split(myparts[8],temppgid," ")
330 | gsub(/^.*\(/,"",temppgid[1])
331 | split(temppgid[1],pgid,".")
332 | }
333 |
334 | if ($0 ~ / subops /) {
335 | split($0,junk," currently ")
336 | MYTYPE="Slow SubOp: "junk[2]
337 | gsub(/ [0-9,]*$/,"",MYTYPE)
338 | split($NF,subosds,",")
339 | for (subosd in subosds) {
340 | subosd="osd."subosds[subosd]
341 | if($12 < 60) {
342 | myeventstring="Slow SubOp,Slow Total,"MYTYPE
343 | osdhistoevent(MYDTSTAMP,subosd,"inc")
344 | osdhistototal(subosd,1)
345 | } else {
346 | myeventstring="Slow Relog SubOp,Slow Relog Total"
347 | }
348 | split(myeventstring,myevents,",")
349 | for(myevent in myevents) {
350 | histototal(myevents[myevent],1)
351 | histoevent(MYDTSTAMP,myevents[myevent],"inc")
352 | osdevent(subosd,myevents[myevent],"inc")
353 | osdtotal(myevents[myevent],1)
354 | poolevent(pgid[1],myevents[myevent],"inc")
355 | }
356 | }
357 | } else {
358 | split($0,junk," currently ")
359 | MYTYPE="Slow Primary: "junk[2]
360 | gsub(/ from [0-9]*/,"",MYTYPE)
361 | if($12 < 60) {
362 | myeventstring="Slow Primary,Slow Total,"MYTYPE
363 | osdhistoevent(MYDTSTAMP,$3,"inc")
364 | osdhistototal($3,"inc")
365 | } else {
366 | myeventstring="Slow Relog Primary,Slow Relog Total"
367 | }
368 | split(myeventstring,myevents,",")
369 | for(myevent in myevents) {
370 | histoevent(MYDTSTAMP,myevents[myevent],"inc")
371 | histototal(myevents[myevent],1)
372 | osdevent($3,myevents[myevent],"inc")
373 | osdtotal(myevents[myevent],1)
374 | poolevent(pgid[1],myevents[myevent],"inc")
375 | }
376 | }
377 | }
378 |
379 | / osdmap / {
380 | MYDTSTAMP=mydtstamp($1" "$2)
381 | histoevent(MYDTSTAMP,"OSDs","set",$11)
382 | histoevent(MYDTSTAMP,"OSDs UP","set",$13)
383 | histoevent(MYDTSTAMP,"OSDs IN","set",$15)
384 | }
385 |
386 | / osd\.[0-9]* out / {
387 | MYDTSTAMP=mydtstamp($1" "$2)
388 | MYEVENT="OSD Out"
389 | histoevent(MYDTSTAMP,MYEVENT,"inc")
390 | histototal(MYEVENT,1)
391 | if($9 ~ /^osd\./)
392 | osdpos=9
393 | if($11 ~ /^osd\./)
394 | osdpos=11
395 | osdevent($osdpos,MYEVENT,"inc")
396 | osdtotal(MYEVENT,1)
397 | }
398 |
399 | / but it is still running$/ {
400 | MYDTSTAMP=mydtstamp($1" "$2)
401 | MYEVENT="OSD Wrongly"
402 | histoevent(MYDTSTAMP,MYEVENT,"inc")
403 | histototal(MYEVENT,1)
404 | osdevent($3,MYEVENT,"inc")
405 | osdtotal(MYEVENT,1)
406 | }
407 |
408 | / wrongly marked me down$/ {
409 | MYDTSTAMP=mydtstamp($1" "$2)
410 | MYEVENT="OSD Wrongly"
411 | histoevent(MYDTSTAMP,MYEVENT,"inc")
412 | histototal(MYEVENT,1)
413 | osdevent($3,MYEVENT,"inc")
414 | osdtotal(MYEVENT,1)
415 | }
416 |
417 | / marked itself down / {
418 | MYDTSTAMP=mydtstamp($1" "$2)
419 | MYEVENT="OSD Down: Self"
420 | histoevent(MYDTSTAMP,MYEVENT,"inc")
421 | histototal(MYEVENT,1)
422 | osdevent($9,MYEVENT,"inc")
423 | osdtotal(MYEVENT,1)
424 | }
425 |
426 | /no active mgr/ {
427 | MYDTSTAMP=mydtstamp($1" "$2)
428 | MYEVENT="MGR: None Active"
429 | histoevent(MYDTSTAMP,MYEVENT,"inc")
430 | histototal(MYEVENT,1)
431 | }
432 |
433 | / calling new monitor election$/ {
434 | MYDTSTAMP=mydtstamp($1" "$2)
435 | MYEVENT="MON: Calling Election"
436 | histoevent(MYDTSTAMP,MYEVENT,"inc")
437 | histototal(MYEVENT,1)
438 | }
439 |
440 | / failed .*report.*from / {
441 | MYDTSTAMP=mydtstamp($1" "$2)
442 | MYEVENT="OSD Down: Reported Failed"
443 | histoevent(MYDTSTAMP,MYEVENT,"inc")
444 | histototal(MYEVENT,1)
445 | if($9 ~ /^osd\./)
446 | osdpos=9
447 | if($10 ~ /^osd\./)
448 | osdpos=10
449 | osdevent($osdpos,MYEVENT,"inc")
450 | osdtotal(MYEVENT,1)
451 | }
452 |
453 | / marked down after no pg stats for / {
454 | MYDTSTAMP=mydtstamp($1" "$2)
455 | MYEVENT="OSD Down: No PG stats"
456 | histoevent(MYDTSTAMP,MYEVENT,"inc")
457 | histototal(MYEVENT,1)
458 | osdevent($9,MYEVENT,"inc")
459 | osdtotal(MYEVENT,1)
460 | }
461 |
462 | / boot$/ {
463 | MYDTSTAMP=mydtstamp($1" "$2)
464 | MYEVENT="OSD Boot"
465 | histoevent(MYDTSTAMP,MYEVENT,"inc")
466 | histototal(MYEVENT,1)
467 | osdevent($10,MYEVENT,"inc")
468 | osdtotal(MYEVENT,1)
469 | }
470 |
471 | END {
472 |
473 | ## Begin outputting the histogram chart
474 | printf("DateTime")
475 | n=asorti(EVENTHEADERS)
476 | if(osdhisto!="")
477 | osdn=asorti(OSDEVENTHEADERS)
478 | for (i = 1; i<= n; i++ )
479 | printf(",%s",EVENTHEADERS[i])
480 | if(osdhisto!="") {
481 | for (i = 1; i<= osdn; i++)
482 | printf(",%s",OSDEVENTHEADERS[i])
483 | }
484 |
485 | printf("\n")
486 |
487 | dtcount=asorti(EVENTCOUNT,DTS)
488 |
489 | for (dtindex =1; dtindex <= dtcount; dtindex++) {
490 | DT=DTS[dtindex]
491 | printf("%s:00", DT)
492 | for (i = 1; i<= n; i++ )
493 | printf(",%s",EVENTCOUNT[DT][EVENTHEADERS[i]])
494 | if(osdhisto!="") {
495 | # add-on the per OSD histo columns
496 | for (i = 1; i<= osdn; i++ )
497 | printf(",%s",OSDEVENTCOUNT[DT][OSDEVENTHEADERS[i]])
498 | }
499 | printf("\n")
500 | }
501 |
502 | ## Begin outputting the column totals line
503 | printf("Totals")
504 | for (i = 1; i<= n; i++ )
505 | printf(",%s",EVENTTOTAL[EVENTHEADERS[i]])
506 | if(osdhisto!="") {
507 | for (i = 1; i<= osdn; i++ )
508 | printf(",%s",OSDEVENTTOTAL[OSDEVENTHEADERS[i]])
509 | }
510 |
511 | printf("\n")
512 | printf("\n")
513 |
514 | ## Begin outputting the OSD chart
515 | o=asorti(OSDHEADERS,OHDR)
516 |
517 | if(osdtree != "") {
518 | printf("OSD Tree Path,")
519 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++)
520 | printf(",")
521 | }
522 |
523 | printf("osd.id")
524 | for (i = 1; i<= o; i++ ) {
525 | printf(",%s",OHDR[i])
526 | }
527 | printf("\n")
528 |
529 | if(osdtree=="") {
530 | for (OSD in OSDEVENT) {
531 | gsub(/^osd\./,"",OSD)
532 | OSDS[OSD]=OSD
533 | }
534 | osdcount=asort(OSDS)
535 | } else {
536 | osdcount=asorti(osdpathsbypath,OSDS)
537 | }
538 | for (osdindex=1; osdindex<=osdcount; osdindex++) {
539 | if(osdtree=="")
540 | osd="osd."OSDS[osdindex]
541 | else {
542 | osd=OSDS[osdindex]
543 | split(OSDS[osdindex],osdparts,",")
544 | osd=osdparts[length(osdparts)]
545 |
546 | printf("%s,",osdpaths[osd])
547 | split(osdpaths[osd],pathjunk,",")
548 | pathdepth=length(pathjunk)
549 |             if(pathdepth<maxpathdepth)
591 |     for(bindex=bcount;bindex>=1;bindex--) {
592 | printf("%s,",BKS[bindex])
593 | split(BKS[bindex],bucketjunk,",")
594 | junklen=length(bucketjunk)
595 | for(i=junklen; i< maxpathdepth; i++)
596 | printf(",")
597 | for (i = 1; i<= o; i++ ) {
598 | if(BUCKETSUMMARY[BKS[bindex]][OHDR[i]]>0)
599 | printf(",%s",BUCKETSUMMARY[BKS[bindex]][OHDR[i]])
600 | else
601 | printf(",")
602 | }
603 | printf("\n")
604 | }
605 | } else {
606 | ## Or print column totals if Bucket Summary is not selected
607 | printf("Totals")
608 | if(osdtree != "") {
609 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++)
610 | printf(",")
611 | }
612 | for (i = 1; i<= o; i++ ) {
613 | printf(",%s",OSDTOTAL[OHDR[i]])
614 | }
615 | }
616 |
617 | printf("\n\n")
618 |
619 | ## Begin outputting the Pool summary chart
620 | if ("Deep-Scrub: Count" in POOLHEADERS) {
621 | POOLHEADERS["Deep-Scrub: Average"]=1
622 | }
623 | poolcount=asorti(POOLEVENT,poolids)
624 | phdrcount=asorti(POOLHEADERS,PHDR)
625 | printf("Pool ID")
626 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++)
627 | printf(",%s",PHDR[phdrindex])
628 | printf("\n")
629 | for(pindex=1;pindex<=poolcount;pindex++) {
630 | printf("%s",poolids[pindex])
631 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) {
632 | if(PHDR[phdrindex]=="Deep-Scrub: Average") {
633 | if(POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"])
634 | printf(",%0.6f",POOLEVENT[poolids[pindex]]["Deep-Scrub: Total"]/POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"])
635 | else
636 | printf(",")
637 | } else
638 | printf(",%s",POOLEVENT[poolids[pindex]][PHDR[phdrindex]])
639 | }
640 | printf("\n")
641 | }
642 | }
643 |
644 |
645 |
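646 | ## Illustrative sketch of the output (values made up).  The END block above
647 | ## emits three comma-separated charts back to back: the per-interval histogram,
648 | ## the per-OSD chart (prefixed with CRUSH tree path columns when the 'osdtree'
649 | ## variable is set), and the per-pool summary.  Column order depends on which
650 | ## events actually appeared in the log:
651 | ##
652 | ##   DateTime,Deep-Scrub: OK,Deep-Scrub: Starts,OSD Boot,Slow Total
653 | ##   2018-01-12 10:00:00,12,14,0,37
654 | ##   Totals,412,415,3,982
655 | ##
656 | ##   osd.id,Deep-Scrub: OK,Deep-Scrub: Starts,OSD Boot,Slow Total
657 | ##   osd.0,7,7,0,11
658 | ##   Totals,412,415,3,982
659 | ##
660 | ##   Pool ID,Deep-Scrub: Average,Deep-Scrub: Count,Deep-Scrub: Max,Deep-Scrub: Min,Deep-Scrub: Total
661 | ##   1,2.103400,55,9.871000,0.112000,115.687000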
--------------------------------------------------------------------------------
/compacting_timing.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 |
3 | ##
4 | ## Provide an OSD log to report the timing of each leveldb compaction event:
5 | ## ./compacting_timing.awk ceph-osd.10.log
6 | ##
7 | ##
8 |
9 | BEGIN {
10 | begtime=0
11 | endtime=0
12 | }
13 | /leveldb: Compact/ {
14 | MYLINE=$0
15 | gsub(/[-:]/," ",$1)
16 | gsub(/[-:]/," ",$2)
17 | MYTIME=mktime($1" "$2)
18 | split($2,secs,".")
19 | millisecs=sprintf("0.%s",secs[2])
20 | MYTIME+=millisecs
21 |
22 | if(begtime==0) {
23 | begtime=MYTIME
24 | }
25 | if(MYTIME>endtime) {
26 | endtime=MYTIME
27 | }
28 | if($6=="Compacting") {
29 | MYSTART=MYTIME
30 | next
31 | }
32 |
33 | if(MYSTART!="") {
34 | mydiff=MYTIME-MYSTART
35 | if(mydiffmymax || mymin=="") {
40 | mymaxreq=MYLINE
41 | mymax=mydiff
42 | }
43 | mysum+=mydiff
44 | mycount++
45 | printf("%s,%s\n", mydiff, MYLINE)
46 | MYSTART=""
47 | }
48 | }
49 | END {
50 | if(mycount=="")
51 | mycount=1
52 |     printf("Min,Avg,Max,Total Time Spent,%%Time spent in compaction\n%s,%s,%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,mysum/mycount,mymax,mysum,mysum/(endtime-begtime)*100,myminreq,mymaxreq)
53 | }
54 |
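55 | ## Illustrative example (not from a real run).  Each 'Compacting' start line
56 | ## paired with its following completion line yields one
57 | ## "<seconds>,<completion log line>" row, and the END rule appends a summary:
58 | ##
59 | ##   0.412,2018-01-12 10:00:03.123456 7f... leveldb: Compacted ...
60 | ##   ...
61 | ##   Min,Avg,Max,Total Time Spent,%Time spent in compaction
62 | ##   0.018,0.274,1.902,84.3,1.7
63 | ##   Min Req: <log line of the fastest compaction>
64 | ##   Max Req: <log line of the slowest compaction>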
--------------------------------------------------------------------------------
/deep-scrub_timing.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 |
3 | ###
4 | #
5 | # Pipe a 'ceph.log' file into the script, redirect the output to a .csv file
6 | #
7 | # cat ceph.log | ./deep-scrub_timing.awk > deep-scrub_timing.csv
8 | #
9 | # The acting OSDs can also be mapped onto each PG's deep-scrub line by supplying a 'ceph pg dump' file.
10 | #
11 | # Example:
12 | # cat ceph.log | ./deep-scrub_timing.awk -v pgdump=/path/to/pgdump > deep-scrub_timing.csv
13 | #
14 | ###
15 |
16 | function safediv(a,b) {
17 | if(b==0) {
18 | return 0
19 | } else {
20 | return a/b
21 | }
22 | }
23 |
24 | BEGIN {
25 | if(pgdump != "") {
26 |         while(( getline line < pgdump ) > 0) {
27 | split(line,a," ")
28 |             if(a[1] !~ /^[0-9]+\.[0-9a-f]+/)
29 |                 continue
30 |             gsub(/[\[\]]/, "", a[15]); gsub(/,/, ",osd.", a[15])
31 |             PGsToOSD[a[1]]="osd."a[15]
32 | }
33 | }
34 | }
35 |
36 | /deep-scrub/ {
37 | MYLINE=$0
38 | MYPG=$9
39 | gsub(/[-:]/," ",$1)
40 | gsub(/[-:]/," ",$2)
41 | MYTIME=mktime($1" "$2)
42 | split($2,secs,".")
43 | millisecs=sprintf("0.%s",secs[2])
44 | MYTIME+=millisecs
45 |
46 | if($NF=="starts") {
47 | MYSTART[MYPG]=MYTIME
48 | next
49 | }
50 |
51 | if(MYSTART[MYPG]!="") {
52 | mydiff=MYTIME-MYSTART[MYPG]
53 | if(mydiffmymax || mymin=="") {
58 | mymaxreq=MYLINE
59 | mymax=mydiff
60 | }
61 | mysum+=mydiff
62 | mycount++
63 | printf("%s,%s,%s\n", mydiff,PGsToOSD[MYPG],MYLINE)
64 | }
65 | }
66 | END {
67 | printf("Min,Avg,Max\n%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,safediv(mysum,mycount),mymax,myminreq,mymaxreq)
68 | }
69 |
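70 | ## Illustrative example (values made up).  Each completed deep-scrub yields a
71 | ## "<seconds>,<acting OSDs>,<log line>" row; the OSD list is empty unless a
72 | ## pgdump file was supplied (the script reads the PG id from column 1 and the
73 | ## bracketed OSD set from column 15 of that file).  A summary follows:
74 | ##
75 | ##   3.141,osd.12,osd.7,osd.31,2018-01-12 10:00:09.123456 osd.12 ... 1.7f deep-scrub ok
76 | ##   ...
77 | ##   Min,Avg,Max
78 | ##   0.214,2.870,41.338
79 | ##   Min Req: <log line of the fastest deep-scrub>
80 | ##   Max Req: <log line of the slowest deep-scrub>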
--------------------------------------------------------------------------------
/images/Histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/Histogram.png
--------------------------------------------------------------------------------
/images/OSDs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/OSDs.png
--------------------------------------------------------------------------------
/images/Pools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/Pools.png
--------------------------------------------------------------------------------
/iops_histo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "$1" ] ; then
4 | echo
5 | echo ' Usage:'
6 | echo
7 | echo " $(basename $0) {ceph.log}"
8 | echo
9 | exit 1
10 | fi
11 |
12 | echo xThousand,Count
13 |
14 | grep pgmap "$1" | awk -F\; '{split($3,a," "); print int(a[7]/1000) }' | sort -n | grep . | uniq -c | awk '{print $2","$1}'
15 |
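16 | # The pipeline above pulls the client 'op/s' figure out of each pgmap line in
17 | # ceph.log (the 7th word of the third ';'-separated field in the usual pgmap
18 | # summary format), buckets it into thousands, and counts how many samples landed
19 | # in each bucket.  Illustrative output (values made up):
20 | #
21 | #   xThousand,Count
22 | #   0,1412
23 | #   1,233
24 | #   12,3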
--------------------------------------------------------------------------------
/map_reporters_to_buckets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "$2" ]; then
4 |     echo
5 |     echo "Usage: $(basename $0) <ceph.log> <ceph osd tree output file>"
6 |     echo
7 |     exit 1
8 | fi
9 |
10 | if [ ! -e "$1" ]; then
11 |     echo "File $1 does not exist." ; exit 1
12 | fi
13 | 
14 | if [ ! -e "$2" ]; then
15 |     echo "File $2 does not exist." ; exit 1
16 | fi
17 |
18 | counters="f b"
19 |
20 | declare -A bucket_counts
21 | declare -A local_count
22 | tmpfile=$(mktemp)
23 |
24 | echo -n "Searching..." >&2
25 | awk '/reported failed/ { printf("f,%s\nb,%s\n",$9,$14)}' $1 | sort -n | uniq -c > $tmpfile
26 | echo -n ", mapping to buckets" >&2
27 |
28 | buckets=()
29 | currbuckets=()
30 | inhost=0
31 | depth_count=0
32 |
33 | echo "buckets...,reported,reporter"
34 |
35 | while read line; do
36 | thirdcol=$(echo $line | awk '{print $3}' | sed -e 's/[^0-9a-zA-Z_]/_/g')
37 | forthcol=$(echo $line | awk '{print $4}')
38 | if [ $(echo $thirdcol | grep -ic "^type$") -gt 0 ]; then
39 | continue
40 | fi
41 | if [ $(echo $thirdcol | grep -c ^osd\.) -gt 0 ]; then
42 |
43 |
44 | for j in $counters; do
45 | local_count[$j]=$(awk -v p="${j},${thirdcol}" '{if ( $2 == p ) { print $1 }}' $tmpfile)
46 | if [ -z ${local_count[$j]} ]; then
47 | local_count[$j]=0
48 | fi
49 | done
50 |
51 | for ((i = 0; i < ${#buckets[*]} ; i++)) {
52 | bucket=${buckets[$i]}
53 | for j in $counters; do
54 | ((bucket_counts[$j,$i]+=${local_count[$j]}))
55 | done
56 | echo -n "${!bucket},"
57 | }
58 | echo -n "$thirdcol"
59 | for j in $counters; do
60 | ((bucket_counts[$j,$i]+=${local_count[$j]}))
61 | echo -n ,${local_count[$j]}
62 | done
63 | echo
64 | else
65 | havebucket=-1
66 | for ((i = 0; i < ${#buckets[*]} ; i++)) {
67 | if [ ${buckets[$i]} == $thirdcol ]; then
68 | havebucket=$i
69 | fi
70 | }
71 | if [ $havebucket -eq -1 ]; then
72 | buckets+=($thirdcol)
73 | ((i++))
74 | for j in $counters; do
75 | bucket_counts[$j,$i]=0
76 | done
77 | else
78 | highest_bucket=${#buckets[*]}
79 | for ((k = $highest_bucket; k > $havebucket; k--)) {
80 | for ((i = 0; i < $k ; i++)); do
81 | bucket=${buckets[$i]}
82 | echo -n "${!bucket},"
83 | done
84 | for j in $counters; do
85 | echo -n ${bucket_counts[$j,$k]},
86 | bucket_counts[$j,$k]=0
87 | done
88 | echo
89 | if [ $k -gt $(($havebucket+1)) ]; then
90 | unset buckets[${#buckets[*]}-1]
91 | fi
92 | }
93 | fi
94 | declare "${thirdcol}=${forthcol}"
95 | fi
96 | done < $2
97 | highest_bucket=${#buckets[*]}
98 | for ((k = $highest_bucket; k > 0; k--)) {
99 | for ((i = 0; i < $k ; i++)); do
100 | bucket=${buckets[$i]}
101 | echo -n "${!bucket},"
102 | done
103 | for j in $counters; do
104 | echo -n ${bucket_counts[$j,$k]},
105 | bucket_counts[$j,$k]=0
106 |             rm -f $tmpfile
107 | done
108 | echo
109 | }
110 | echo >&2
111 |
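112 | # Expected inputs (a sketch; the filenames are examples only):
113 | #
114 | #   ./map_reporters_to_buckets.sh ceph.log osd-tree.txt > reporters_by_bucket.csv
115 | #
116 | # where ceph.log contains the monitor 'reported failed' lines and osd-tree.txt is
117 | # the plain-text output of 'ceph osd tree' (the script keys off its TYPE and NAME
118 | # columns).  Each output row lists the enclosing bucket names, the OSD, and the
119 | # two counts named in the header ('reported' and 'reporter'), with per-bucket
120 | # subtotal rows in between, e.g.:
121 | #
122 | #   buckets...,reported,reporter
123 | #   default,node-04,osd_17,3,12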
--------------------------------------------------------------------------------
/rgw_proc_time.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 | BEGIN {
3 | MYMONTHS["Jan"]=1
4 | MYMONTHS["Feb"]=2
5 | MYMONTHS["Mar"]=3
6 | MYMONTHS["Apr"]=4
7 | MYMONTHS["May"]=5
8 | MYMONTHS["Jun"]=6
9 | MYMONTHS["Jul"]=7
10 | MYMONTHS["Aug"]=8
11 | MYMONTHS["Sep"]=9
12 | MYMONTHS["Oct"]=10
13 | MYMONTHS["Nov"]=11
14 | MYMONTHS["Dec"]=12
15 | }
16 |
17 | {
18 | gsub(/[-:]/," ",$1)
19 | gsub(/[-:]/," ",$2)
20 | ENDTIME=mktime($1" "$2)
21 | split($2,secs,".")
22 | millisecs=sprintf("0.%s",secs[2])
23 | ENDTIME+=millisecs
24 |
25 | sub(/^./,"",$10)
26 | gsub(/[\/\-:]/," ",$10)
27 | maxb=split($10,b," ")
28 | b[2]=sprintf("%02d",MYMONTHS[b[2]])
29 | STARTTIMESTRING=b[3]" "b[2]" "b[1]" "b[4]" "b[5]" "b[6]
30 | STARTTIME=mktime(STARTTIMESTRING)
31 | delta=ENDTIME-STARTTIME
32 | print $1" "$2" ("ENDTIME") -"STARTTIMESTRING" ("STARTTIME") :: "delta
33 | }
34 |
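35 | # Intended use (a sketch, based on how the fields are parsed above): feed it the
36 | # civetweb access lines from a radosgw log, where field 10 is the bracketed
37 | # request start stamp, e.g. "[12/Jan/2018:10:15:30".  For each line it prints the
38 | # completion stamp from the log prefix, the parsed start stamp (both with their
39 | # epoch values), and the difference in seconds:
40 | #
41 | #   ./rgw_proc_time.awk rgw-civetweb-lines.log
42 | #
43 | # Note: every input line is processed and the start stamp has only one-second
44 | # resolution, so deltas are approximate and non-matching lines produce garbage.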
--------------------------------------------------------------------------------
/rgw_req_timing.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ -z "$1" ]; then
4 |     echo
5 |     echo "Usage: $(basename $0) <radosgw log file>"
6 |     echo
7 |     exit 1
8 | fi
9 |
10 | if [ ! -e "$1" ]; then
11 |     echo "File $1 does not exist." ; exit 1
12 | fi
13 |
14 | awk '
15 | BEGIN {
16 | mymin=999
17 | mymax=0
18 | mysum=0
19 | mycount=0
20 | }
21 | /req=/ {
22 | MYLINE=$0
23 | gsub(/[-:]/," ",$1)
24 | gsub(/[-:]/," ",$2)
25 | MYTIME=mktime($1" "$2)
26 | split($2,secs,".")
27 | millisecs=sprintf("0.%s",secs[2])
28 | MYTIME+=millisecs
29 | if(match(MYLINE,/starting new request/)) {
30 | MYREQ=$9
31 | MYSTART[MYREQ]=MYTIME
32 | }
33 | if(match(MYLINE,/req done/)) {
34 | MYREQ=$8
35 | if(MYSTART[MYREQ]!="") {
36 | mydiff=MYTIME-MYSTART[MYREQ]
37 | if(mydiffmymax) {
42 | mymaxreq=MYLINE
43 | mymax=mydiff
44 | }
45 | mysum+=mydiff
46 | mycount++
47 | printf("%s,%s\n", mydiff, MYLINE)
48 | }
49 | }
50 | }
51 | END {
52 | printf("Min,Avg,Max\n%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,mysum/mycount,mymax,myminreq,mymaxreq)
53 | }
54 | ' $1
55 |
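56 | # Illustrative example (values made up).  The embedded awk pairs each
57 | # "starting new request req=..." line with its matching "req done req=..." line
58 | # (the radosgw debug log must include both), prints one
59 | # "<seconds>,<'req done' log line>" row per completed request, then a summary:
60 | #
61 | #   0.003,2018-01-12 10:15:31.123456 7f... 1 ====== req done req=0x7f... ...
62 | #   ...
63 | #   Min,Avg,Max
64 | #   0.001,0.042,3.912
65 | #   Min Req: <fastest request's 'req done' line>
66 | #   Max Req: <slowest request's 'req done' line>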
--------------------------------------------------------------------------------
/slow_by_osd-pool-type.awk:
--------------------------------------------------------------------------------
1 | #!/usr/bin/awk -f
2 | #
3 | # by: Michael Kidd
4 | # https://github.com/linuxkidd
5 | #
6 | # Use:
7 | # chmod 755 slow_by_osd-pool-type.awk
8 | # ./slow_by_osd-pool-type.awk ceph.log
9 | #
10 | # Output Options:
11 | # -v csv=1
12 | # -v pivot=1
13 | # -v perline=1
14 | #
15 | # Note: with no '-v' option specified, the script prints a 'visual' layout meant for human reading
16 | # Note 2: only one output option may be used per execution
17 | #
18 | #
19 |
20 | BEGIN {
21 | PROCINFO["sorted_in"] = "@val_num_asc"
22 | }
23 |
24 | /slow request [3-5][0-9]\./ {
25 | if($20 ~ /^[0-9]*\.[0-9a-fs]*$/) {
26 | split($20,a,".")
27 | b=$0
28 | gsub(/^.*currently /,"",b)
29 | gsub(/ from .*/, "", b)
30 | slowtype[b]++
31 | slowosd[$3]++
32 | slowosdbytype[$3][b]++
33 | slowbypool[a[1]]++
34 | slowbypooltype[a[1]][b]++
35 | slowpoolosdtype[a[1]][$3][b]++
36 | slowtypepools[b][a[1]]++
37 | }
38 | }
39 |
40 | function printVisual() {
41 | print "Pool stats: "
42 | for(p in slowbypool) {
43 | print "Pool id: "p" Total slow: "slowbypool[p]
44 | for (t in slowbypooltype[p]) {
45 | print "\t"slowbypooltype[p][t]"\t"t
46 | }
47 | }
48 | print ""
49 | print ""
50 | print "OSD Stats: "
51 | for (o in slowosd) {
52 | print "\t"o" "slowosd[o]
53 | for (t in slowosdbytype[o]) {
54 | print "\t\t"slowosdbytype[o][t]" "t
55 | }
56 | }
57 | print ""
58 | print ""
59 | print "Slow by Type: "
60 | for (t in slowtype) {
61 | print "\t"slowtype[t]" "t
62 | }
63 | }
64 |
65 | function printCSV() {
66 | printf("Pool,")
67 | for(t in slowtype) {
68 | printf("%s,",t)
69 | }
70 | print ""
71 | for(p in slowbypool) {
72 | printf("%s,",p)
73 | for (t in slowtype) {
74 | printf("%d,",slowbypooltype[p][t])
75 | }
76 | print ""
77 | }
78 | printf("Total:,")
79 | for (t in slowtype) {
80 | printf("%d,",slowtype[t])
81 | }
82 | print ""
83 | print ""
84 | printf("OSD,")
85 | for(t in slowtype) {
86 | printf("%s,",t)
87 | }
88 | print ""
89 | for (o in slowosd) {
90 | printf("%s,",o)
91 | for (t in slowtype) {
92 | printf("%s,",slowosdbytype[o][t])
93 | }
94 | print ""
95 | }
96 | printf("Total:,")
97 | for (t in slowtype) {
98 | printf("%d,",slowtype[t])
99 | }
100 | print ""
101 | }
102 |
103 | function printPerLine() {
104 | print "Pool,OSD,Type,Count"
105 | for(p in slowpoolosdtype){
106 | for(o in slowpoolosdtype[p]) {
107 | for(t in slowpoolosdtype[p][o])
108 | print p","o","t","slowpoolosdtype[p][o][t]
109 | }
110 | }
111 | }
112 |
113 | function printPivot() {
114 | printf(",")
115 | for(t in slowtype) {
116 | printf("%s",t)
117 | for(p in slowtypepools[t]) {
118 | l2=l2","p
119 | ptotal=ptotal","slowtypepools[t][p]
120 | sumtotal+=slowtypepools[t][p]
121 | printf(",")
122 | }
123 | }
124 | print "Totals"
125 | printf("OSD / Pool ID%s\n",l2)
126 | for(o in slowosd) {
127 | printf("%s,",o)
128 | for(t in slowtype) {
129 | for(p in slowtypepools[t]) {
130 | if(slowpoolosdtype[p][o][t]>0)
131 | printf("%d,",slowpoolosdtype[p][o][t])
132 | else
133 | printf(",")
134 | }
135 | }
136 | print slowosd[o]
137 | }
138 | print "Totals:"ptotal","sumtotal
139 | }
140 |
141 | END {
142 | if(csv==1)
143 | printCSV()
144 | else if(pivot==1)
145 | printPivot()
146 | else if(perline==1)
147 | printPerLine()
148 | else
149 | printVisual()
150 | }
151 |
152 |
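153 | # Illustrative '-v csv=1' output (values made up).  Only slow requests in the
154 | # 30-59 second range are tallied (the /slow request [3-5][0-9]\./ pattern), each
155 | # distinct "currently ..." state becomes a column, and rows and columns are
156 | # ordered by ascending totals:
157 | #
158 | #   Pool,waiting for rw locks,waiting for subops,
159 | #   7,0,11,
160 | #   1,40,312,
161 | #   Total:,40,323,
162 | #
163 | #   OSD,waiting for rw locks,waiting for subops,
164 | #   osd.12,3,290,
165 | #   Total:,40,323,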
--------------------------------------------------------------------------------