├── LICENSE ├── README.md ├── ceph_log_parser.awk ├── ceph_log_parser.luminous.awk ├── compacting_timing.awk ├── deep-scrub_timing.awk ├── images ├── Histogram.png ├── OSDs.png └── Pools.png ├── iops_histo.sh ├── map_reporters_to_buckets.sh ├── rgw_proc_time.awk ├── rgw_req_timing.sh └── slow_by_osd-pool-type.awk /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. 
You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ceph-log-parsers 2 | Tools for parsing ceph logs to help with troubleshooting various issues. 3 | 4 | ## Tool Explanations: 5 | NOTE: I've shortened the sample outputs below with ellipses for the sake of brevity. 6 | 7 | #### ceph_log_parser.awk 8 | - Run with ceph.log and redirect output to a CSV 9 | 10 | ``` 11 | ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv 12 | ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv 13 | ``` 14 | 15 | Available options: 16 | 17 | -v osdtree=ceph_osd_tree.txt 18 | If provided, the osd output portion will be output with its branch path in the crushmap 19 | 20 | -v timeinterval=(1|10|60|day) 21 | If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes) 22 | 23 | -v bucketsummary=1 24 | If provided, provides an output below the OSD data summarizing the OSD counts for each 25 | successive bucket branch above the OSD ( example: host, rack, row, root ) 26 | Default is 1 if 'osdtree' is defined. 27 | 28 | -v osdhisto=1 29 | Provides a column per OSD in the time histogram showing initial 'slow request' entries 30 | incurred by that OSD during the time interval.
31 | Default is disabled because this can make VERY wide spreadsheets 32 | 33 | NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be 34 | ignored 35 | 36 | 37 | * For items which are averaged, these are summed and averaged over the measurement interval 38 | The measurement is reported at the beginning of the interval measurement period 39 | 40 | e.g. IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line 41 | 42 | * For items which are a static snapshot, these are reported based on the last line containing those 43 | details in the log before the end of the measurement interval 44 | 45 | e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for 46 | the 03:30 line 47 | 48 | * For items like the Slow requests, the count of those entries is summed during the measured period and reported 49 | 50 | e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported 51 | If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma 52 | separated numbers) are all counted in the 'Slow SubOp' line. For 3x replication and 50 lines, the reported 53 | number would be 100 (due to 2x non-primary copies * 50 lines) 54 | 55 | * NOTE: Slow request processing changed as of 27 Feb 2018. The initial slow request (delay < 60 seconds) and 56 | relogged slow requests (delay > 60 seconds) are logged separately to better understand whether an issue is ongoing. 57 | 58 | 59 | ##### ATTENTION: 60 | - This command's output, more than any of the others, really should be viewed in a spreadsheet tool. I typically highlight the headers (at the top of each section), bold them, rotate them so the text is vertical, then auto-adjust the column widths to get a more concise view which is much easier to visually parse. Graphing the data in this report can also make trends stand out and help narrow the scope when hunting for faulting components.
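A quick way to get the report into a spreadsheet straight from the command line (a minimal sketch assuming LibreOffice is installed; any CSV-aware spreadsheet application works just as well):

```
# Open the parsed report directly in Calc for the formatting/graphing described above
libreoffice --calc ceph-log-parsed.csv
```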
61 | 62 | ###### Example: 63 | ``` 64 | # ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=10 -v bucketsummary=1 ceph.log > ~/ceph-log-parsed.csv 65 | # cat ~/ceph-log-parsed.csv 66 | 67 | DateTime,Deep-Scrub: OK,Deep-Scrub: Starts,IO: Client Avg IOPs,IO: Client Avg Read MB/s,IO: Client Avg Write MB/s,IO: Recovery Avg MB/s,OSD Boot,OSD Down: No PG stats,OSD Down: Reported Failed,OSD Out, 68 | OSD Wrongly,OSDs,OSDs IN,OSDs UP,Objects: Degraded Percent,Objects: Misplaced Percent,PG: activating,PG: active,PG: backfill_toofull,PG: backfilling,PG: clean,PG: deep,PG: degraded,PG: down,PG: inactiv 69 | e,PG: incomplete,PG: peered,PG: peering,PG: recovering,PG: recovery_wait,PG: remapped,PG: scrubbing,PG: stale,PG: undersized,PG: wait_backfill,Slow Primary,Slow Primary: commit_sent,Slow Primary: no fl 70 | ag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waitin 71 | g for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total,Space (TB): Data Stored,Space (TB): Free,Space (TB): Raw Used,Space (TB): Total 72 | 2018-01-21 03:10:00,,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, 73 | 2018-01-21 03:20:00,19,12,10193.47,132.71,86.42,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 74 | 2018-01-21 03:30:00,6,7,11243.27,214.92,70.60,,,,,,,,,,,,,32960,,,32960,1,,,,,,,,,,1,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 75 | 2018-01-21 03:40:00,9,8,9566.01,202.62,73.42,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 76 | 2018-01-21 03:50:00,1,1,8549.33,163.93,71.18,,,,,,,249,236,236,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 77 | 2018-01-21 04:00:00,,,8331.46,121.57,65.20,,,,,,,,,,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 78 | 2018-01-21 04:10:00,11,13,7480.16,58.25,80.61,,,,,,,249,236,236,,,,32960,,,32960,2,,,,,,,,,,2,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 79 | 2018-01-21 04:20:00,13,11,7202.10,41.08,66.31,,,,,,,,,,,,,32960,,,32960,,,,,,,,,,,,,,,,,,,,,,,,,,,133.00,235.00,405.00,641.00 80 | ... 
81 | 2018-01-22 03:20:00,13,21,7216.23,88046.62,99450.79,0.00,141,,199,39,144,249,197,178,19.106,10.282,500,26598,,88,15737,4,13503,26,2,,2221,3639,143,2206,5011,4,780,11410,4486,80786,106,4351,74801,7,43,3,,1475,,5465,86251,132.00,198.00,337.00,535.00 82 | 2018-01-22 03:30:00,12,18,9517.31,156494.72,200462.23,0.00,,3,8,22,,249,175,167,26.780,21.825,5,29840,,90,17326,11,15559,66,,,3042,73,63,3678,9054,11,816,12360,8934,45929,292,8915,36457,18,2,5,,240,,6408,52337,133.00,174.00,300.00,475.00 83 | 2018-01-22 03:40:00,25,18,5481.73,52358.01,71523.06,0.00,1,,1,8,1,249,167,167,26.497,25.867,7,29920,,55,19215,3,13677,66,,,2967,66,58,3391,10242,3,816,10242,10167,18953,146,4204,14561,15,1,3,,23,,2464,21417,133.00,165.00,288.00,453.00 84 | 2018-01-22 03:50:00,10,11,7.30,0.03,0.08,0.00,,,,,,249,167,167,26.201,25.723,1,29942,,66,20370,4,12523,66,,,2951,66,43,2299,10192,4,816,10192,10115,15095,94,3218,11773,10,,,,,,1422,16517,133.00,163.00,290.00,453.00 85 | 2018-01-22 04:00:00,27,33,6.93,0.00,0.07,0.00,,,,,,249,167,167,25.892,25.512,,29981,,76,21228,10,11666,66,,,2913,66,23,1514,10136,10,816,10136,10053,10415,138,2452,7809,4,,,,12,,914,11329,133.00,162.00,291.00,453.00 86 | 2018-01-22 04:10:00,37,38,11.08,5704.84,7398.04,0.00,1,,1,,1,249,167,167,25.571,25.226,,30042,,93,21682,10,11211,66,,,2852,66,12,1138,10068,10,816,10068,9968,4657,63,885,3697,12,,,,,,518,5175,133.00,160.00,293.00,453.00 87 | 2018-01-22 04:20:00,28,22,5.14,0.10,0.15,0.00,,,,,,249,167,167,25.219,24.890,1,30116,,103,22079,5,10814,66,,,2777,66,14,818,9986,5,816,9986,9879,7952,38,1040,6870,4,,,,,,584,8536,133.00,159.00,294.00,453.00 88 | 2018-01-22 04:30:00,13,12,50.82,15611.23,13142.18,0.00,1,,1,,1,249,167,167,24.858,24.539,,30198,,104,22402,4,10492,66,,,2696,66,5,590,9900,4,816,9900,9793,10170,52,2320,7795,3,,,,,,818,10988,133.00,157.00,296.00,453.00 89 | 2018-01-22 04:40:00,18,23,12.58,0.02,0.03,0.00,,,1,,,249,167,166,24.873,24.018,,30209,,111,22214,7,10665,77,,4,2670,77,6,455,9754,7,816,10208,9640,6481,63,1135,5278,5,,,,,,554,7035,133.00,156.00,297.00,453.00 90 | 2018-01-22 04:50:00,22,18,146.06,1123.10,1301.40,0.00,,,,1,,249,166,166,24.490,24.207,,30304,,96,22559,5,10320,72,,9,2575,72,3,490,9829,5,816,9828,9731,3935,36,626,3271,2,,,,,,587,4522,133.00,153.00,297.00,451.00 91 | 2018-01-22 05:00:00,16,16,740.75,10099.43,9356.42,0.00,2,,2,,3,249,166,166,24.126,23.865,,30391,6,104,22789,5,10090,72,,9,2488,72,3,350,9740,5,816,9739,9633,4908,60,598,4234,2,,,,14,,626,5534,133.00,152.00,298.00,451.00 92 | 2018-01-22 05:10:00,25,26,102.86,20550.58,21074.40,0.00,2,,3,,2,249,166,165,23.962,23.374,52,30332,7,97,22643,5,10191,77,,11,2450,115,7,273,9618,5,847,9916,9485,5562,83,818,4653,3,2,,,3,,1012,6574,133.00,150.00,300.00,451.00 93 | ... 
94 | 95 | OSD Tree Path,,,,osd.id,Deep-Scrub: OK,Deep-Scrub: Starts,OSD Boot,OSD Down: No PG stats,OSD Down: Reported Failed,OSD Out,OSD Wrongly,Slow Primary,Slow Primary: commit_sent,Slow Primary: no flag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waiting for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total 96 | default,row1,rack1,osd-node-003,osd.0,37,37,2,,1,1,,1265,8,69,927,,2,,,259,,,1265 97 | default,row1,rack1,osd-node-003,osd.2,26,26,3,,2,,2,2745,24,39,2046,3,6,,,627,,34081,36826 98 | default,row1,rack1,osd-node-003,osd.3,15,15,3,,2,,2,1547,7,188,1065,3,,,,284,,680,2227 99 | default,row1,rack1,osd-node-003,osd.4,28,29,3,,2,1,1,1590,,1,934,,1,,,654,,35,1625 100 | default,row1,rack1,osd-node-003,osd.5,24,27,2,,1,,1,3038,14,305,1982,,1,,,736,,2,3040 101 | default,row1,rack1,osd-node-003,osd.6,31,31,2,,1,,1,2281,24,505,1063,,,,,689,,,2281 102 | default,row1,rack1,osd-node-003,osd.7,43,43,1,,,,,2744,16,113,1840,5,,,,770,,,2744 103 | default,row1,rack1,osd-node-003,osd.8,33,33,1,,,,,1390,3,26,954,,9,,,398,,,1390 104 | default,row1,rack1,osd-node-003,osd.9,34,34,1,,,,,2883,4,220,1548,,3,,,1108,,,2883 105 | default,row1,rack1,osd-node-004,osd.10,8,8,2,1,,1,,1994,13,1,925,1,1,1,,1052,,,1994 106 | default,row1,rack1,osd-node-004,osd.11,9,9,2,1,,1,,1133,1,,956,,,1,,175,,,1133 107 | ... 108 | default,row1,rack2,osd-node-029,,915,915,31,0,21,4,17,12717,341,2732,7155,32,38,1,0,2418,0,0,12717 109 | default,row1,rack2,osd-node-028,,496,497,26,0,16,2,14,17615,124,2223,12062,14,30,0,0,3162,0,0,17615 110 | default,row1,rack2,osd-node-027,,154,154,20,0,11,3,7,13095,224,1753,7253,12,54,14,0,3785,0,0,13095 111 | default,row1,rack2,osd-node-026,,445,445,22,0,12,3,9,15869,578,3750,7262,19,43,35,0,4182,0,0,15869 112 | default,row1,rack2,osd-node-025,,720,720,18,0,10,3,7,16185,123,1691,9394,14,30,3,0,4930,0,0,16185 113 | default,row1,rack2,osd-node-024,,882,882,24,0,13,4,10,21237,384,3710,8365,25,62,47,0,8643,1,0,21237 114 | default,row1,rack2,osd-node-023,,564,564,19,0,10,1,9,16237,38,1062,11968,1,30,5,0,3133,0,0,16237 115 | default,row1,rack2,osd-node-022,,521,521,18,0,9,1,8,21534,66,1261,14698,11,40,4,0,5454,0,0,21534 116 | ... 
117 | 118 | Pool ID,Deep-Scrub: Average,Deep-Scrub: Count,Deep-Scrub: Max,Deep-Scrub: Min,Deep-Scrub: Total,Slow Primary,Slow Primary: commit_sent,Slow Primary: no flag points reached,Slow Primary: reached_pg,Slow Primary: started,Slow Primary: waiting for degraded object,Slow Primary: waiting for missing object,Slow Primary: waiting for ondisk,Slow Primary: waiting for rw locks,Slow Primary: waiting for scrub,Slow SubOp,Slow Total 119 | 1,289.468405,40,507.328,242.775,11578.7,,,,,,,,,,,, 120 | 2,0.137212,3,0.216364,0.035713,0.411637,,,,,,,,,,,, 121 | 3,0.036633,3,0.059088,0.0110841,0.1099,,,,,,,,,,,, 122 | 4,0.080645,3,0.115252,0.0404911,0.241935,221,,2,211,1,7,,,,,,221 123 | 5,0.159889,2,0.2046,0.115178,0.319778,,,,,,,,,,,, 124 | 6,0.065334,4,0.109835,0.029923,0.261337,,,,,,,,,,,, 125 | 7,0.029934,6,0.061202,0.00389504,0.179606,,,,,,,,,,,, 126 | 8,0.054197,3,0.0712051,0.02034,0.162592,,,,,,,,,,,, 127 | 9,209.420344,5061,1365.56,11.3129,1.05988e+06,455401,4806,50870,274801,441,987,246,2,123226,22,69519,524920 128 | 11,16.549503,2698,100.63,0.613281,44650.6,2,2,,,,,,,,,16,18 129 | 12,0.119071,2694,27.8345,0.00160193,320.777,3,,3,,,,,,,,,3 130 | 13,0.130721,3,0.267788,0.0144391,0.392163,,,,,,,,,,,, 131 | ``` 132 | ###### Example screenshots from Spreadsheet view: 133 | ###### Time histogram ( 10 minute interval ) 134 | ![Time Histogram](https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/master/images/Histogram.png) 135 | 136 | ###### OSD Chart with OSD Tree input 137 | ![OSD Chart](https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/master/images/OSDs.png) 138 | 139 | ###### Pool chart showing scrub and slow request counters 140 | ![Pool Chart](https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/master/images/Pools.png) 141 | 142 | #### deep-scrub_timing.awk 143 | - Provide the `ceph.log` and this script will provide an output showing the time between the start and stop of every deep-scrub. The output format is csv, with the first column being the deep-scrub time in seconds, second column being the 'deep-scrub' line which stopped the timer. The start/stop lines are keyed on the pg.id. At the end of the processing, a Min,Avg,Max output is also provided, along with the 'deep-scrub' completed line for the Min and Max processing times. 144 | 145 | ###### Example: 146 | ``` 147 | # ./deep-scrub_timing.awk /var/log/ceph/ceph.log > ~/deep-scrub_timings.csv 148 | # cat ~/deep-scrub_timings.csv 149 | 150 | 0.0155821,2018-01-16 03:44:06.068707 osd.764 10.129.152.42:6851/3796002 4467 : cluster [INF] 29.243 deep-scrub ok 151 | 0.0110428,2018-01-16 03:44:11.223353 osd.447 10.129.152.33:6851/3784262 4900 : cluster [INF] 29.5ad deep-scrub ok 152 | 0.0009799,2018-01-16 03:45:59.345522 osd.927 10.129.152.50:6836/2106288 6823 : cluster [INF] 20.e9 deep-scrub ok 153 | 0.002249,2018-01-16 03:46:04.488109 osd.284 10.129.152.30:6848/3526172 4303 : cluster [INF] 18.2f deep-scrub ok 154 | 0.000980854,2018-01-16 03:47:26.628785 osd.540 10.129.152.40:6824/4041304 5864 : cluster [INF] 23.238 deep-scrub ok 155 | 0.00139022,2018-01-16 03:47:27.402259 osd.684 10.129.152.42:6818/3777592 5148 : cluster [INF] 17.26d deep-scrub ok 156 | ... 
157 | Min,Avg,Max 158 | 0.000564098,248.451,846.795 159 | Min Req: 2018-01-16 11:28:00.908817 osd.4 10.129.152.25:6837/3496196 5784 : cluster [INF] 48.32 deep-scrub ok 160 | Max Req: 2018-01-17 01:13:12.793967 osd.131 10.129.152.23:6814/3605203 3452 : cluster [INF] 30.7f7 deep-scrub ok 161 | ``` 162 | 163 | #### iops_histo.sh 164 | - Provide a 'ceph.log', this script will output a CSV file that can be graphed to understand the IOPs histogram for the time covered by the ceph.log. Left column is thousand IOPs, right column is how many 'pgmap' entries fall into that thousand. 165 | 166 | ###### Example: 167 | ``` 168 | # ./iops_histo.sh ceph.log > iops_histo.csv 169 | # cat iops_histo.csv 170 | 171 | 0,628 172 | 1,124 173 | 2,1986 174 | 3,8339 175 | 4,4218 176 | 5,3705 177 | 6,3233 178 | 7,2574 179 | 8,2013 180 | 9,1453 181 | 10,890 182 | 11,607 183 | 12,413 184 | 13,349 185 | 14,287 186 | 15,238 187 | 16,252 188 | 17,214 189 | 18,173 190 | ``` 191 | 192 | #### map_reporters_to_buckets.sh 193 | - Provide with a ceph-mon.log and text output file from 'ceph osd tree' and this script will generate a mapping of 'reported failed' (reported and reporters) counts as a result. 194 | 195 | ``` 196 | # ceph osd tree > ceph_osd_tree.txt 197 | # ./map_reporters_to_buckets.sh ceph-mon.log ceph_osd_tree.txt > reporters.csv 198 | Searching..., mapping to buckets 199 | 200 | # cat reporters.csv 201 | buckets...,reported,reporter 202 | default,rack1,ceph-storage-003,osd.0,2411,1520 203 | default,rack1,ceph-storage-003,osd.6,1880,2198 204 | default,rack1,ceph-storage-003,osd.10,2456,1663 205 | default,rack1,ceph-storage-003,osd.15,1978,2677 206 | ... 207 | default,rack1,ceph-storage-003,24256,22256, 208 | default,rack1,ceph-storage-004,osd.423,3869,1893 209 | default,rack1,ceph-storage-004,osd.425,3024,2832 210 | default,rack1,ceph-storage-004,osd.427,2219,2439 211 | ... 212 | default,rack1,ceph-storage-004,27784,21096, 213 | ... 214 | default,rack1,206045,167742, 215 | ... 216 | default,rack2,199356,137798, 217 | ... 218 | default,rack3,ceph-storage-046,osd.254,34761,46650 219 | default,rack3,ceph-storage-046,osd.259,32485,38331 220 | default,rack3,ceph-storage-046,osd.264,33657,48924 221 | default,rack3,ceph-storage-046,osd.269,31560,48421 222 | default,rack3,ceph-storage-046,309241,409805, 223 | ... 224 | default,rack3,313686,413547, 225 | default,719087,719087, 226 | 227 | ``` 228 | 229 | #### rgw_req_timing.sh 230 | - Provide the `radosgw.log` and this script will provide an output showing the time between the start and return of every RGW request. The output format is csv, with the first column being the request time in seconds, second column being the 'req done' line which stopped the timer. The start/stop lines are keyed on the request ID assigned by RGW. At the end of the processing, a Min,Avg,Max output is also provided, along with the 'req done' line for the Min and Max request times. 
231 | 232 | ###### Example: 233 | ``` 234 | # ./rgw_req_timing.sh /var/log/ceph/ceph-rgw-myhostname.log > ~/req_timings.csv 235 | # cat ~/req_timings.csv 236 | 237 | 0.187219,2018-01-16 03:47:01.622215 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ====== 238 | 0.051897,2018-01-16 03:47:01.989993 2af8a132d700 1 ====== req done req=0x2af8a1327710 op status=0 http_status=200 ====== 239 | 0.181928,2018-01-16 03:47:02.045216 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ====== 240 | 0.052496,2018-01-16 03:47:02.047359 2af8a5335700 1 ====== req done req=0x2af8a532f710 op status=0 http_status=200 ====== 241 | 0.279186,2018-01-16 03:47:02.207797 2af87e7e5700 1 ====== req done req=0x2af87e7df710 op status=0 http_status=200 ====== 242 | 0.16574,2018-01-16 03:47:02.447974 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ====== 243 | 0.29716,2018-01-16 03:47:02.712994 2af87e7e5700 1 ====== req done req=0x2af87e7df710 op status=0 http_status=200 ====== 244 | 0.186362,2018-01-16 03:47:02.828799 2af878cd7700 1 ====== req done req=0x2af878cd1710 op status=0 http_status=200 ====== 245 | 0.236106,2018-01-16 03:47:02.931637 2af88ab00700 1 ====== req done req=0x2af88aafa710 op status=0 http_status=200 ====== 246 | 0.0516322,2018-01-16 03:47:02.952181 2af87f0e7700 1 ====== req done req=0x2af87f0e1710 op status=0 http_status=200 ====== 247 | ... 248 | Min,Avg,Max 249 | 0.000127792,0.73737,1200.11 250 | Min Req: 2018-01-16 15:46:07.383273 2af89230f700 1 ====== req done req=0x2af892309710 op status=0 http_status=400 ====== 251 | Max Req: 2018-01-16 12:09:07.163211 2af89130d700 1 ====== req done req=0x2af891307710 op status=0 http_status=200 ====== 252 | ``` 253 | -------------------------------------------------------------------------------- /ceph_log_parser.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | ####################################################### 4 | ####################################################### 5 | ## 6 | ## Run with ceph.log and redirect output to a CSV 7 | ## 8 | ## ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv 9 | ## ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv 10 | ## 11 | ## 12 | ## Available options: 13 | ## -v osdtree=ceph_osd_tree.txt 14 | ## If provided, the osd output portion will be output with its branch path in the crushmap 15 | ## 16 | ## -v timeinterval=(1|10|60|day) 17 | ## If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes) 18 | ## 19 | ## -v bucketsummary=1 20 | ## If provided, provides an output below the OSD data summarizing the OSD counts for each 21 | ## successive bucket branch above the OSD ( example: host, rack, row, root ) 22 | ## Default is 1 if 'osdtree' is defined. 23 | ## 24 | ## -v osdhisto=1 25 | ## Provides a column per OSD in the time histogram showing initial 'slow request' entries 26 | ## incurred by that OSD during the time interval. 
27 | ## Default is disabled because this can make VERY wide spreadsheets 28 | ## 29 | ## NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be 30 | ## ignored 31 | ## 32 | ## * For items which are average, these are summed and averaged over the measurement interval 33 | ## The measurement is reported at the beginning of the interval measurement period 34 | ## e.g IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line 35 | ## 36 | ## * For items which are a static snapshot, these are reported based on the last line containing those 37 | ## details in the log before the end of the measurement interval 38 | ## e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for 39 | ## the 03:30 line 40 | ## 41 | ## * For items like the Slow requests, the count of those entries is summed during the 10 minute period and reported 42 | ## e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported 43 | ## If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma 44 | ## separated numbers), are all counted in the 'Slow SubOp' line. For 3x replication, and 50 lines, the reported 45 | ## number would be 100 (due to 2x non-primary copies * 50 lines) 46 | ## 47 | ## 48 | ####################################################### 49 | ####################################################### 50 | 51 | 52 | 53 | function toMB(mynum,myunit) { 54 | myunit=tolower(myunit) 55 | if (myunit ~ /^b/) { mynum/=(1024*1024); } 56 | else if (myunit ~ /^kb/) { mynum/=1024; } 57 | else if (myunit ~ /^gb/) { mynum*=1024; } 58 | else if (myunit ~ /^tb/) { mynum*=1024*1024; } 59 | return sprintf("%0.2f",mynum) 60 | } 61 | 62 | function toTB(mynum,myunit) { 63 | myunit=tolower(myunit) 64 | if (myunit ~ /^b/) { mynum/=(1024*1024*1024*1024) } 65 | else if (myunit ~ /^kb/) { mynum/=(1024*1024*1024) } 66 | else if (myunit ~ /^mb/) { mynum/=(1024*1024) } 67 | else if (myunit ~ /^gb/) { mynum/=1024 } 68 | else if (myunit ~ /^pb/) { mynum*=1024 } 69 | else if (myunit ~ /^eb/) { mynum*=1024*1024 } 70 | return sprintf("%0.2f",mynum) 71 | } 72 | 73 | function join(array,sep) { 74 | if(1 in array) { 75 | result=array[1] 76 | arraylen=length(array) 77 | if(arraylen>1) { 78 | for(z=2;z<=arraylen;z++) 79 | result = result sep array[z] 80 | } 81 | } 82 | return result 83 | } 84 | 85 | function procbranch(myline) { 86 | split(myline,lineparts," ") 87 | if(lineparts[3] in branchtype) { 88 | if(currentdepth>branchtype[lineparts[3]]) { 89 | for(i=currentdepth;i>branchtype[lineparts[3]];i--) { 90 | delete prefix[i] 91 | delete branchtype[i] 92 | } 93 | delete prefix[branchtype[lineparts[3]]] 94 | } 95 | } else { 96 | currentdepth++ 97 | branchtype[lineparts[3]]=currentdepth 98 | } 99 | prefix[branchtype[lineparts[3]]]=lineparts[4] 100 | wasinhost=0 101 | } 102 | 103 | function procosd(myline) { 104 | split(myline,lineparts," ") 105 | outline=join(prefix,",") 106 | if(classenable==1) 107 | outline=outline","lineparts[2] 108 | osdpaths[lineparts[osdoffset]]=outline 109 | outline=outline","lineparts[osdoffset] 110 | osdpathsbypath[outline]=lineparts[osdoffset] 111 | if(currentdepth>maxpathdepth) 112 | maxpathdepth=currentdepth 113 | } 114 | 115 | function histoevent(mykey,myevent,myfunc,myvalue) { 116 | EVENTHEADERS[myevent]=1 117 | if(myfunc=="sum") 118 | EVENTCOUNT[mykey][myevent]+=myvalue 119 | else if(myfunc=="set") 120 | 
EVENTCOUNT[mykey][myevent]=myvalue 121 | else if(myfunc=="inc") 122 | EVENTCOUNT[mykey][myevent]++ 123 | } 124 | 125 | function histototal(myevent,myvalue) { 126 | EVENTTOTAL[myevent]+=myvalue 127 | } 128 | 129 | function osdhistoevent(mykey,myevent,myfunc,myvalue) { 130 | if(osdhisto!="") { 131 | OSDEVENTHEADERS[myevent]=1 132 | if(myfunc=="sum") 133 | OSDEVENTCOUNT[mykey][myevent]+=myvalue 134 | else if(myfunc=="set") 135 | OSDEVENTCOUNT[mykey][myevent]=myvalue 136 | else if(myfunc=="inc") 137 | OSDEVENTCOUNT[mykey][myevent]++ 138 | } 139 | } 140 | 141 | function osdhistototal(myevent,myvalue) { 142 | if(osdhisto!="") 143 | OSDEVENTTOTAL[myevent]+=myvalue 144 | } 145 | 146 | function osdevent(mykey,myevent,myfunc,myvalue) { 147 | OSDHEADERS[myevent]=1 148 | if(myfunc=="sum") 149 | OSDEVENT[mykey][myevent]+=myvalue 150 | else if(myfunc=="set") 151 | OSDEVENT[mykey][myevent]=myvalue 152 | else if(myfunc=="inc") 153 | OSDEVENT[mykey][myevent]++ 154 | } 155 | 156 | function osdtotal(myevent,myvalue) { 157 | OSDTOTAL[myevent]+=myvalue 158 | } 159 | 160 | function poolevent(mykey,myevent,myfunc,myvalue) { 161 | POOLHEADERS[myevent]=1 162 | if(myfunc=="sum") 163 | POOLEVENT[mykey][myevent]+=myvalue 164 | else if(myfunc=="set") 165 | POOLEVENT[mykey][myevent]=myvalue 166 | else if(myfunc=="inc") 167 | POOLEVENT[mykey][myevent]++ 168 | else if(myfunc=="max") { 169 | if(myvalue>POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "") 170 | POOLEVENT[pgparts[1]][myevent]=myvalue 171 | } else if(myfunc=="min") { 172 | if(myvalue 0 ) { 199 | split(line,osdtreeparts," ") 200 | switch (osdtreeparts[1]) { 201 | case "ID": 202 | classenable=0 203 | osdoffset=3 204 | if(osdtreeparts[2]=="CLASS") { 205 | classenable=1 206 | osdoffset=4 207 | } 208 | break 209 | case /^ *-/: 210 | procbranch(line) 211 | break 212 | case /^ *[0-9]/: 213 | wasinhost=1 214 | procosd(line) 215 | break 216 | } 217 | } 218 | } 219 | } 220 | 221 | / HEALTH_/ { 222 | MYDTSTAMP=mydtstamp($1" "$2) 223 | myline=$0 224 | gsub(";","",$9) 225 | histoevent(MYDTSTAMP,$9,"inc") 226 | myeventadd=0 227 | split(myline,mylineparts,"; ") 228 | 229 | for(linepartindex in mylineparts) { 230 | if(mylineparts[linepartindex] ~ /flag\(s\) set/) { 231 | split(mylineparts[linepartindex],linespaced," ") 232 | split(linespaced[1],flags,",") 233 | for(flagidx in flags) { 234 | histoevent(MYDTSTAMP,"Flag: "flags[flagidx],"inc") 235 | } 236 | } 237 | } 238 | 239 | } 240 | 241 | / pgmap / { 242 | MYDTSTAMP=mydtstamp($1" "$2) 243 | myline=$0 244 | myeventadd=0 245 | split(myline,mylineparts,";") 246 | 247 | for(linepartindex in mylineparts) { 248 | switch (mylineparts[linepartindex]) { 249 | case / pgs: /: 250 | split(mylineparts[linepartindex],junka,":") 251 | split(junka[7],pgstats,",") 252 | 253 | # Reset the counts so that only the last line in a measured interval is accumulated 254 | if(MYDTSTAMP in EVENTCOUNT) { 255 | for(key in EVENTCOUNT[MYDTSTAMP]) 256 | if(key ~ /^PG: /) 257 | delete EVENTCOUNT[MYDTSTAMP][key] 258 | } 259 | 260 | for(pgstatindex in pgstats) { 261 | pgstat=pgstats[pgstatindex] 262 | split(pgstat,statparts," ") 263 | split(statparts[2],pgstate,"+") 264 | for(pgstateindex in pgstate) { 265 | myeventname="PG: "pgstate[pgstateindex] 266 | histoevent(MYDTSTAMP,myeventname,"sum",statparts[1]) 267 | } 268 | } 269 | break 270 | case / avail$/: 271 | split(mylineparts[linepartindex],clusterspace,",") 272 | for(spaceindex in clusterspace) { 273 | split(clusterspace[spaceindex],myspaceparts," ") 274 | if(myspaceparts[3] ~ /^data/) 
{ 275 | histoevent(MYDTSTAMP,"Space (TB): Data Stored","set",toTB(myspaceparts[1],myspaceparts[2])) 276 | } else if(myspaceparts[3] ~ /^used/) { 277 | histoevent(MYDTSTAMP,"Space (TB): Raw Used","set",toTB(myspaceparts[1],myspaceparts[2])) 278 | } else if(6 in myspaceparts) { 279 | histoevent(MYDTSTAMP,"Space (TB): Free","set",toTB(myspaceparts[1],myspaceparts[2])) 280 | histoevent(MYDTSTAMP,"Space (TB): Total","set",toTB(myspaceparts[4],myspaceparts[5])) 281 | } 282 | } 283 | break 284 | case /op\/s/: 285 | split(mylineparts[linepartindex],clilineparts,",") 286 | for(clilpindex in clilineparts) { 287 | split(clilineparts[clilpindex],mycliparts," ") 288 | if(3 in mycliparts) { 289 | myeventadd=toMB(mycliparts[1],mycliparts[2]) 290 | if(mycliparts[3] ~ /^rd/) { 291 | myeventname="IO: Client Avg Read MB/s" 292 | myeventcount="Client Read Count" 293 | } 294 | else if(mycliparts[3] ~ /^wr/) { 295 | myeventname="IO: Client Avg Write MB/s" 296 | myeventcount="Client Write Count" 297 | } 298 | } else { 299 | myeventname="IO: Client Avg IOPs" 300 | myeventadd=mycliparts[1] 301 | myeventcount="Client IOPsCount" 302 | } 303 | histoevent(MYDTSTAMP,myeventname,"set",sprintf("%0.2f",((EVENTCOUNT[MYDTSTAMP][myeventname]*EVENTCOUNT[MYDTSTAMP][myeventcount])+myeventadd)/(EVENTCOUNT[MYDTSTAMP][myeventcount]+1))) 304 | EVENTCOUNT[MYDTSTAMP][myeventcount]++ 305 | } 306 | break 307 | case / objects degraded /: 308 | split(mylineparts[linepartindex],degradeobj," ") 309 | gsub(/[^0-9\.]/,"",degradeobj[4]) 310 | histoevent(MYDTSTAMP,"Objects: Degraded Percent","set",degradeobj[4]) 311 | break 312 | case / objects misplaced /: 313 | split(mylineparts[linepartindex],degradeobj," ") 314 | gsub(/[^0-9\.]/,"",degradeobj[4]) 315 | histoevent(MYDTSTAMP,"Objects: Misplaced Percent","set",degradeobj[4]) 316 | break 317 | case / recovering$/: 318 | myeventname="IO: Recovery Avg MB/s" 319 | myeventcount="RecoveryCount" 320 | split(mylineparts[linepartindex],reclineparts," ") 321 | myeventadd=toMB(reclineparts[1],reclineparts[2]) 322 | histoevent(MYDTSTAMP,myeventname,"set",sprintf("%0.2f",((EVENTCOUNT[MYDTSTAMP][myeventname]*EVENTCOUNT[MYDTSTAMP][myeventcount])+myeventadd)/(EVENTCOUNT[MYDTSTAMP][myeventcount]+1))) 323 | EVENTCOUNT[MYDTSTAMP][myeventcount]++ 324 | break 325 | } 326 | } 327 | } 328 | 329 | / deep-scrub / { 330 | MYDTSTAMP=mydtstamp($1" "$2) 331 | MYPG=$9 332 | MYDATE=$1 333 | MYTIME=$2 334 | gsub(/[-:]/," ",MYDATE) 335 | gsub(/[-:]/," ",MYTIME) 336 | MYTIME=mktime(MYDATE" "MYTIME) 337 | split($2,secs,".") 338 | millisecs=sprintf("0.%s",secs[2]) 339 | MYTIME+=millisecs 340 | 341 | if($NF == "starts") { 342 | MYEVENT="Deep-Scrub: Starts" 343 | histoevent(MYDTSTAMP,MYEVENT,"inc") 344 | osdevent($3,MYEVENT,"inc") 345 | osdtotal(MYEVENT,1) 346 | histototal(MYEVENT,1) 347 | MYSTART[MYPG]=MYTIME 348 | } 349 | else { 350 | if(MYSTART[MYPG]!="") { 351 | mydiff=MYTIME-MYSTART[MYPG] 352 | split(MYPG,pgparts,".") 353 | poolevent(pgparts[1],"Deep-Scrub: Count","inc") 354 | poolevent(pgparts[1],"Deep-Scrub: Total","sum",mydiff) 355 | poolevent(pgparts[1],"Deep-Scrub: Min","min",mydiff) 356 | poolevent(pgparts[1],"Deep-Scrub: Max","max",mydiff) 357 | } 358 | if($NF == "ok") { 359 | MYEVENT="Deep-Scrub: OK" 360 | histoevent(MYDTSTAMP,MYEVENT,"inc") 361 | histototal(MYEVENT,1) 362 | osdevent($3,MYEVENT,"inc") 363 | osdtotal(MYEVENT,1) 364 | } else { 365 | MYEVENT="Deep-Scrub: Not OK" 366 | histoevent(MYDTSTAMP,MYEVENT,"inc") 367 | histototal(MYEVENT,1) 368 | osdevent($3,MYEVENT,"inc") 369 | osdtotal(MYEVENT,1) 370 | } 371 | 
} 372 | } 373 | 374 | /slow request / { 375 | MYDTSTAMP=mydtstamp($1" "$2) 376 | MYLINE=$0 377 | split(MYLINE,myparts,":") 378 | split(myparts[9],opparts," ") 379 | if (opparts[2] ~ /^[0-9]*\.[0-9a-f]*$/) 380 | split(opparts[2],pgid,".") 381 | else if (opparts[9] ~ /^[0-9]*\.[0-9a-f]*$/) 382 | split(opparts[9],pgid,".") 383 | 384 | if ($0 ~ /subops from/) { 385 | split($NF,subosds,",") 386 | for (subosdidx in subosds) { 387 | subosd="osd."subosds[subosdidx] 388 | if($11 < 60) { 389 | myeventstring="Slow SubOp,Slow Total" 390 | osdhistoevent(MYDTSTAMP,subosd,"inc") 391 | osdhistototal(subosd,"inc") 392 | } else { 393 | myeventstring="Slow Relog SubOp,Slow Relog Total" 394 | } 395 | split(myeventstring,myevents,",") 396 | for(myevent in myevents) { 397 | histototal(myevents[myevent],1) 398 | histoevent(MYDTSTAMP,myevents[myevent],"inc") 399 | osdevent(subosd,myevents[myevent],"inc") 400 | osdtotal(myevents[myevent],1) 401 | poolevent(pgid[1],myevents[myevent],"inc") 402 | } 403 | } 404 | } else { 405 | MYTYPE=$0 406 | mytpartcount=split($0,mytparts," currently ") 407 | MYTYPE="Slow Primary: "mytparts[mytpartcount] 408 | if($11 < 60) { 409 | myeventstring="Slow Primary,Slow Total,"MYTYPE 410 | osdhistoevent(MYDTSTAMP,$3,"inc") 411 | osdhistototal($3,"inc") 412 | } else { 413 | myeventstring="Slow Relog Primary,Slow Relog Total" 414 | } 415 | split(myeventstring,myevents,",") 416 | for(myevent in myevents) { 417 | histoevent(MYDTSTAMP,myevents[myevent],"inc") 418 | histototal(myevents[myevent],1) 419 | osdevent($3,myevents[myevent],"inc") 420 | osdtotal(myevents[myevent],1) 421 | poolevent(pgid[1],myevents[myevent],"inc") 422 | } 423 | } 424 | } 425 | 426 | / osdmap / { 427 | MYDTSTAMP=mydtstamp($1" "$2) 428 | histoevent(MYDTSTAMP,"OSDs","set",$11) 429 | histoevent(MYDTSTAMP,"OSDs UP","set",$13) 430 | histoevent(MYDTSTAMP,"OSDs IN","set",$15) 431 | } 432 | 433 | / osd\.[0-9]* out / { 434 | MYDTSTAMP=mydtstamp($1" "$2) 435 | MYEVENT="OSD Out" 436 | histoevent(MYDTSTAMP,MYEVENT,"inc") 437 | histototal(MYEVENT,1) 438 | osdevent($9,MYEVENT,"inc") 439 | osdtotal(MYEVENT,1) 440 | } 441 | 442 | / wrongly marked me down$/ { 443 | MYDTSTAMP=mydtstamp($1" "$2) 444 | MYEVENT="OSD Wrongly" 445 | histoevent(MYDTSTAMP,MYEVENT,"inc") 446 | histototal(MYEVENT,1) 447 | osdevent($3,MYEVENT,"inc") 448 | osdtotal(MYEVENT,1) 449 | } 450 | 451 | / marked itself down/ { 452 | MYDTSTAMP=mydtstamp($1" "$2) 453 | MYEVENT="OSD Down: Self" 454 | histoevent(MYDTSTAMP,MYEVENT,"inc") 455 | histototal(MYEVENT,1) 456 | osdevent($9,MYEVENT,"inc") 457 | osdtotal(MYEVENT,1) 458 | } 459 | 460 | / failed .*reports from / { 461 | MYDTSTAMP=mydtstamp($1" "$2) 462 | MYEVENT="OSD Down: Reported Failed" 463 | histoevent(MYDTSTAMP,MYEVENT,"inc") 464 | histototal(MYEVENT,1) 465 | osdevent($9,MYEVENT,"inc") 466 | osdtotal(MYEVENT,1) 467 | } 468 | 469 | / marked down after no pg stats for / { 470 | MYDTSTAMP=mydtstamp($1" "$2) 471 | MYEVENT="OSD Down: No PG stats" 472 | histoevent(MYDTSTAMP,MYEVENT,"inc") 473 | histototal(MYEVENT,1) 474 | osdevent($9,MYEVENT,"inc") 475 | osdtotal(MYEVENT,1) 476 | } 477 | 478 | / boot$/ { 479 | MYDTSTAMP=mydtstamp($1" "$2) 480 | MYEVENT="OSD Boot" 481 | histoevent(MYDTSTAMP,MYEVENT,"inc") 482 | histototal(MYEVENT,1) 483 | osdevent($9,MYEVENT,"inc") 484 | osdtotal(MYEVENT,1) 485 | } 486 | 487 | END { 488 | 489 | ## Begin outputting the histogram chart 490 | printf("DateTime") 491 | n=asorti(EVENTHEADERS) 492 | if(osdhisto!="") 493 | osdn=asorti(OSDEVENTHEADERS) 494 | for (i = 1; i<= n; i++ ) 495 | 
printf(",%s",EVENTHEADERS[i]) 496 | if(osdhisto!="") { 497 | for (i = 1; i<= osdn; i++) 498 | printf(",%s",OSDEVENTHEADERS[i]) 499 | } 500 | 501 | printf("\n") 502 | 503 | dtcount=asorti(EVENTCOUNT,DTS) 504 | 505 | for (dtindex =1; dtindex <= dtcount; dtindex++) { 506 | DT=DTS[dtindex] 507 | printf("%s:00", DT) 508 | for (i = 1; i<= n; i++ ) 509 | printf(",%s",EVENTCOUNT[DT][EVENTHEADERS[i]]) 510 | if(osdhisto!="") { 511 | # add-on the per OSD histo columns 512 | for (i = 1; i<= osdn; i++ ) 513 | printf(",%s",OSDEVENTCOUNT[DT][OSDEVENTHEADERS[i]]) 514 | } 515 | printf("\n") 516 | } 517 | 518 | ## Begin outputting the column totals line 519 | printf("Totals") 520 | for (i = 1; i<= n; i++ ) 521 | printf(",%s",EVENTTOTAL[EVENTHEADERS[i]]) 522 | if(osdhisto!="") { 523 | for (i = 1; i<= osdn; i++ ) 524 | printf(",%s",OSDEVENTTOTAL[OSDEVENTHEADERS[i]]) 525 | } 526 | 527 | printf("\n") 528 | printf("\n") 529 | 530 | ## Begin outputting the OSD chart 531 | o=asorti(OSDHEADERS,OHDR) 532 | 533 | if(osdtree != "") { 534 | printf("OSD Tree Path,") 535 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++) 536 | printf(",") 537 | } 538 | 539 | printf("osd.id") 540 | for (i = 1; i<= o; i++ ) { 541 | printf(",%s",OHDR[i]) 542 | } 543 | printf("\n") 544 | 545 | if(osdtree=="") { 546 | for (OSD in OSDEVENT) { 547 | gsub(/^osd\./,"",OSD) 548 | OSDS[OSD]=OSD 549 | } 550 | osdcount=asort(OSDS) 551 | } else { 552 | osdcount=asorti(osdpathsbypath,OSDS) 553 | } 554 | for (osdindex=1; osdindex<=osdcount; osdindex++) { 555 | if(osdtree=="") 556 | osd="osd."OSDS[osdindex] 557 | else { 558 | osd=OSDS[osdindex] 559 | split(OSDS[osdindex],osdparts,",") 560 | osd=osdparts[length(osdparts)] 561 | 562 | printf("%s,",osdpaths[osd]) 563 | split(osdpaths[osd],pathjunk,",") 564 | pathdepth=length(pathjunk) 565 | if(pathdepth=1;bindex--) { 608 | printf("%s,",BKS[bindex]) 609 | split(BKS[bindex],bucketjunk,",") 610 | junklen=length(bucketjunk) 611 | for(i=junklen; i< maxpathdepth; i++) 612 | printf(",") 613 | for (i = 1; i<= o; i++ ) { 614 | if(BUCKETSUMMARY[BKS[bindex]][OHDR[i]]>0) 615 | printf(",%s",BUCKETSUMMARY[BKS[bindex]][OHDR[i]]) 616 | else 617 | printf(",") 618 | } 619 | printf("\n") 620 | } 621 | } else { 622 | ## Or print column totals if Bucket Summary is not selected 623 | printf("Totals") 624 | if(osdtree != "") { 625 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++) 626 | printf(",") 627 | } 628 | for (i = 1; i<= o; i++ ) { 629 | printf(",%s",OSDTOTAL[OHDR[i]]) 630 | } 631 | } 632 | 633 | printf("\n\n") 634 | 635 | ## Begin outputting the Pool summary chart 636 | POOLHEADERS["Deep-Scrub: Average"]=1 637 | poolcount=asorti(POOLEVENT,poolids) 638 | phdrcount=asorti(POOLHEADERS,PHDR) 639 | printf("Pool ID") 640 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) 641 | printf(",%s",PHDR[phdrindex]) 642 | printf("\n") 643 | for(pindex=1;pindex<=poolcount;pindex++) { 644 | printf("%s",poolids[pindex]) 645 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) { 646 | if(PHDR[phdrindex]=="Deep-Scrub: Average") { 647 | if(POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"]) 648 | printf(",%0.6f",POOLEVENT[poolids[pindex]]["Deep-Scrub: Total"]/POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"]) 649 | else 650 | printf(",") 651 | } else 652 | printf(",%s",POOLEVENT[poolids[pindex]][PHDR[phdrindex]]) 653 | } 654 | printf("\n") 655 | } 656 | } 657 | 658 | 659 | -------------------------------------------------------------------------------- /ceph_log_parser.luminous.awk: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | ####################################################### 4 | ####################################################### 5 | ## 6 | ## Run with ceph.log and redirect output to a CSV 7 | ## 8 | ## ./ceph_log_parser.awk ceph.log > ceph-log-parsed.csv 9 | ## ./ceph_log_parser.awk -v osdtree=ceph_osd_tree.txt -v timeinterval=60 -v bucketsummary=1 ceph.log > ceph-log-parsed.csv 10 | ## 11 | ## 12 | ## Available options: 13 | ## -v osdtree=ceph_osd_tree.txt 14 | ## If provided, the osd output portion will be output with its branch path in the crushmap 15 | ## 16 | ## -v timeinterval=(1|10|60|day) 17 | ## If provided, adjusts the time alignment for the histogram output. Default is 10 (minutes) 18 | ## 19 | ## -v bucketsummary=1 20 | ## If provided, provides an output below the OSD data summarizing the OSD counts for each 21 | ## successive bucket branch above the OSD ( example: host, rack, row, root ) 22 | ## Default is 1 if 'osdtree' is defined. 23 | ## 24 | ## -v osdhisto=1 25 | ## Provides a column per OSD in the time histogram showing initial 'slow request' entries 26 | ## incurred by that OSD during the time interval. 27 | ## Default is disabled because this can make VERY wide spreadsheets 28 | ## 29 | ## NOTE: These options MUST be specified **BEFORE** the ceph.log file, otherwise they will be 30 | ## ignored 31 | ## 32 | ## * For items which are average, these are summed and averaged over the measurement interval 33 | ## The measurement is reported at the beginning of the interval measurement period 34 | ## e.g IO: Client Read MB/s for 03:30 to 03:40 is averaged, then reported on the 03:30 line 35 | ## 36 | ## * For items which are a static snapshot, these are reported based on the last line containing those 37 | ## details in the log before the end of the measurement interval 38 | ## e.g. PG: active for 03:30 to 03:40 - If a pgmap is found at 03:39:59, that will be the one reported for 39 | ## the 03:30 line 40 | ## 41 | ## * For items like the Slow requests, the count of those entries is summed during the 10 minute period and reported 42 | ## e.g. If there are 50 'slow request ' logs in the 10 minute interval which are for a primary OSD, then 50 is reported 43 | ## If there are 50 'slow request ' logs 'waiting for subop', then the OSDs called out by the subop (comma 44 | ## separated numbers), are all counted in the 'Slow SubOp' line. 
For 3x replication, and 50 lines, the reported 45 | ## number would be 100 (due to 2x non-primary copies * 50 lines) 46 | ## 47 | ## 48 | ####################################################### 49 | ####################################################### 50 | 51 | 52 | 53 | function toMB(mynum,myunit) { 54 | myunit=tolower(myunit) 55 | if (myunit ~ /^b/) { mynum/=(1024*1024); } 56 | else if (myunit ~ /^kb/) { mynum/=1024; } 57 | else if (myunit ~ /^gb/) { mynum*=1024; } 58 | else if (myunit ~ /^tb/) { mynum*=1024*1024; } 59 | return sprintf("%0.2f",mynum) 60 | } 61 | 62 | function toTB(mynum,myunit) { 63 | myunit=tolower(myunit) 64 | if (myunit ~ /^b/) { mynum/=(1024*1024*1024*1024) } 65 | else if (myunit ~ /^kb/) { mynum/=(1024*1024*1024) } 66 | else if (myunit ~ /^mb/) { mynum/=(1024*1024) } 67 | else if (myunit ~ /^gb/) { mynum/=1024 } 68 | else if (myunit ~ /^pb/) { mynum*=1024 } 69 | else if (myunit ~ /^eb/) { mynum*=1024*1024 } 70 | return sprintf("%0.2f",mynum) 71 | } 72 | 73 | function join(array,sep) { 74 | if(1 in array) { 75 | result=array[1] 76 | arraylen=length(array) 77 | if(arraylen>1) { 78 | for(z=2;z<=arraylen;z++) 79 | result = result sep array[z] 80 | } 81 | } 82 | return result 83 | } 84 | 85 | function procbranch(myline) { 86 | split(myline,lineparts," ") 87 | if(lineparts[3] in branchtype) { 88 | if(currentdepth>branchtype[lineparts[3]]) { 89 | for(i=currentdepth;i>branchtype[lineparts[3]];i--) { 90 | delete prefix[i] 91 | delete branchtype[i] 92 | } 93 | delete prefix[branchtype[lineparts[3]]] 94 | } 95 | } else { 96 | currentdepth++ 97 | branchtype[lineparts[3]]=currentdepth 98 | } 99 | prefix[branchtype[lineparts[3]]]=lineparts[4] 100 | wasinhost=0 101 | } 102 | 103 | function procosd(myline) { 104 | split(myline,lineparts," ") 105 | outline=join(prefix,",") 106 | if(classenable==1) 107 | outline=outline","lineparts[2] 108 | osdpaths[lineparts[osdoffset]]=outline 109 | outline=outline","lineparts[osdoffset] 110 | osdpathsbypath[outline]=lineparts[osdoffset] 111 | if(currentdepth>maxpathdepth) 112 | maxpathdepth=currentdepth 113 | } 114 | 115 | function histoevent(mykey,myevent,myfunc,myvalue) { 116 | EVENTHEADERS[myevent]=1 117 | if(myfunc=="sum") 118 | EVENTCOUNT[mykey][myevent]+=myvalue 119 | else if(myfunc=="set") 120 | EVENTCOUNT[mykey][myevent]=myvalue 121 | else if(myfunc=="inc") 122 | EVENTCOUNT[mykey][myevent]++ 123 | } 124 | 125 | function histototal(myevent,myvalue) { 126 | EVENTTOTAL[myevent]+=myvalue 127 | } 128 | 129 | function osdhistoevent(mykey,myevent,myfunc,myvalue) { 130 | if(osdhisto!="") { 131 | OSDEVENTHEADERS[myevent]=1 132 | if(myfunc=="sum") 133 | OSDEVENTCOUNT[mykey][myevent]+=myvalue 134 | else if(myfunc=="set") 135 | OSDEVENTCOUNT[mykey][myevent]=myvalue 136 | else if(myfunc=="inc") 137 | OSDEVENTCOUNT[mykey][myevent]++ 138 | } 139 | } 140 | 141 | function osdhistototal(myevent,myvalue) { 142 | if(osdhisto!="") 143 | OSDEVENTTOTAL[myevent]+=myvalue 144 | } 145 | 146 | function osdevent(mykey,myevent,myfunc,myvalue) { 147 | OSDHEADERS[myevent]=1 148 | if(myfunc=="sum") 149 | OSDEVENT[mykey][myevent]+=myvalue 150 | else if(myfunc=="set") 151 | OSDEVENT[mykey][myevent]=myvalue 152 | else if(myfunc=="inc") 153 | OSDEVENT[mykey][myevent]++ 154 | } 155 | 156 | function osdtotal(myevent,myvalue) { 157 | OSDTOTAL[myevent]+=myvalue 158 | } 159 | 160 | function poolevent(mykey,myevent,myfunc,myvalue) { 161 | POOLHEADERS[myevent]=1 162 | if(myfunc=="sum") 163 | POOLEVENT[mykey][myevent]+=myvalue 164 | else if(myfunc=="set") 165 | 
POOLEVENT[mykey][myevent]=myvalue 166 | else if(myfunc=="inc") 167 | POOLEVENT[mykey][myevent]++ 168 | else if(myfunc=="max") { 169 | if(myvalue>POOLEVENT[pgparts[1]][myevent] || POOLEVENT[pgparts[1]][myevent] == "") 170 | POOLEVENT[pgparts[1]][myevent]=myvalue 171 | } else if(myfunc=="min") { 172 | if(myvalue 0 ) { 199 | split(line,osdtreeparts," ") 200 | switch (osdtreeparts[1]) { 201 | case "ID": 202 | classenable=0 203 | osdoffset=3 204 | if(osdtreeparts[2]=="CLASS") { 205 | classenable=1 206 | osdoffset=4 207 | } 208 | break 209 | case /^ *-/: 210 | procbranch(line) 211 | break 212 | case /^ *[0-9]/: 213 | wasinhost=1 214 | procosd(line) 215 | break 216 | } 217 | } 218 | } 219 | } 220 | 221 | / overall HEALTH/ { 222 | if($NF == "HEALTH_OK") 223 | next 224 | MYDTSTAMP=mydtstamp($1" "$2) 225 | myline=$0 226 | myeventadd=0 227 | split(myline,mlpa," : ") 228 | split(mlpa[2],mylineparts,";") 229 | 230 | for(linepartindex in mylineparts) { 231 | switch (mylineparts[linepartindex]) { 232 | case / osds down$/: 233 | split(mylineparts[linepartindex],osdparts," ") 234 | histoevent(MYDTSTAMP,"OSDs down","set",osdparts[5]) 235 | break 236 | case / host.*down$/: 237 | split(mylineparts[linepartindex],hostparts," ") 238 | histoevent(MYDTSTAMP,"HOSTs down","set",hostparts[1]) 239 | break 240 | case /Reduced data availability: /: 241 | case /Possible data damage: /: 242 | split(mylineparts[linepartindex],linepartA,":") 243 | split(linepartA[2],linepartB,",") 244 | for(field in linepartB) { 245 | split(linepartB[field],fparts," ") 246 | myevent="PG: "fparts[3] 247 | histoevent(MYDTSTAMP,myevent,"set",fparts[1]) 248 | } 249 | break 250 | case /Degraded data redundancy: /: 251 | split(mylineparts[linepartindex],linepartA,":") 252 | split(linepartA[2],linepartB,",") 253 | for(field in linepartB) { 254 | if(linepartB[field] ~ /objects degraded/) { 255 | split(linepartB[field],linepartC," ") 256 | gsub(/[^0-9\.]/,"",linepartC[4]) 257 | histoevent(MYDTSTAMP,"Objects: Degraded Percent","set",linepartC[4]) 258 | } else { 259 | split(linepartB[field],fparts," ") 260 | myevent="PG: "fparts[3] 261 | histoevent(MYDTSTAMP,myevent,"set",fparts[1]) 262 | } 263 | } 264 | break 265 | case / objects misplaced /: 266 | split(mylineparts[linepartindex],degradeobj," ") 267 | gsub(/[^0-9\.]/,"",degradeobj[4]) 268 | histoevent(MYDTSTAMP,"Objects: Misplaced Percent","set",degradeobj[4]) 269 | break 270 | } 271 | } 272 | } 273 | 274 | / deep-scrub / { 275 | MYDTSTAMP=mydtstamp($1" "$2) 276 | MYPG=$9 277 | MYDATE=$1 278 | MYTIME=$2 279 | gsub(/[-:]/," ",MYDATE) 280 | gsub(/[-:]/," ",MYTIME) 281 | MYTIME=mktime(MYDATE" "MYTIME) 282 | split($2,secs,".") 283 | millisecs=sprintf("0.%s",secs[2]) 284 | MYTIME+=millisecs 285 | 286 | if($NF == "starts") { 287 | MYEVENT="Deep-Scrub: Starts" 288 | histoevent(MYDTSTAMP,MYEVENT,"inc") 289 | osdevent($3,MYEVENT,"inc") 290 | osdtotal(MYEVENT,1) 291 | histototal(MYEVENT,1) 292 | MYSTART[MYPG]=MYTIME 293 | } 294 | else { 295 | if(MYSTART[MYPG]!="") { 296 | mydiff=MYTIME-MYSTART[MYPG] 297 | split(MYPG,pgparts,".") 298 | poolevent(pgparts[1],"Deep-Scrub: Count","inc") 299 | poolevent(pgparts[1],"Deep-Scrub: Total","sum",mydiff) 300 | poolevent(pgparts[1],"Deep-Scrub: Min","min",mydiff) 301 | poolevent(pgparts[1],"Deep-Scrub: Max","max",mydiff) 302 | } 303 | if($NF == "ok") { 304 | MYEVENT="Deep-Scrub: OK" 305 | histoevent(MYDTSTAMP,MYEVENT,"inc") 306 | histototal(MYEVENT,1) 307 | osdevent($3,MYEVENT,"inc") 308 | osdtotal(MYEVENT,1) 309 | } else { 310 | MYEVENT="Deep-Scrub: Not OK" 311 | 
histoevent(MYDTSTAMP,MYEVENT,"inc") 312 | histototal(MYEVENT,1) 313 | osdevent($3,MYEVENT,"inc") 314 | osdtotal(MYEVENT,1) 315 | } 316 | } 317 | } 318 | 319 | /slow request / { 320 | MYDTSTAMP=mydtstamp($1" "$2) 321 | MYLINE=$0 322 | split(MYLINE,myparts,":") 323 | split(myparts[9],opparts," ") 324 | if (opparts[2] ~ /^[0-9]*\.[0-9a-f]*$/) 325 | split(opparts[2],pgid,".") 326 | else if (opparts[9] ~ /^[0-9]*\.[0-9a-f]*/) 327 | split(opparts[9],pgid,".") 328 | else if (myparts[8] ~ /pg_update_log_missing/) { 329 | split(myparts[8],temppgid," ") 330 | gsub(/^.*\(/,"",temppgid[1]) 331 | split(temppgid[1],pgid,".") 332 | } 333 | 334 | if ($0 ~ / subops /) { 335 | split($0,junk," currently ") 336 | MYTYPE="Slow SubOp: "junk[2] 337 | gsub(/ [0-9,]*$/,"",MYTYPE) 338 | split($NF,subosds,",") 339 | for (subosd in subosds) { 340 | subosd="osd."subosds[subosd] 341 | if($12 < 60) { 342 | myeventstring="Slow SubOp,Slow Total,"MYTYPE 343 | osdhistoevent(MYDTSTAMP,subosd,"inc") 344 | osdhistototal(subosd,"inc") 345 | } else { 346 | myeventstring="Slow Relog SubOp,Slow Relog Total" 347 | } 348 | split(myeventstring,myevents,",") 349 | for(myevent in myevents) { 350 | histototal(myevents[myevent],1) 351 | histoevent(MYDTSTAMP,myevents[myevent],"inc") 352 | osdevent(subosd,myevents[myevent],"inc") 353 | osdtotal(myevents[myevent],1) 354 | poolevent(pgid[1],myevents[myevent],"inc") 355 | } 356 | } 357 | } else { 358 | split($0,junk," currently ") 359 | MYTYPE="Slow Primary: "junk[2] 360 | gsub(/ from [0-9]*/,"",MYTYPE) 361 | if($12 < 60) { 362 | myeventstring="Slow Primary,Slow Total,"MYTYPE 363 | osdhistoevent(MYDTSTAMP,$3,"inc") 364 | osdhistototal($3,"inc") 365 | } else { 366 | myeventstring="Slow Relog Primary,Slow Relog Total" 367 | } 368 | split(myeventstring,myevents,",") 369 | for(myevent in myevents) { 370 | histoevent(MYDTSTAMP,myevents[myevent],"inc") 371 | histototal(myevents[myevent],1) 372 | osdevent($3,myevents[myevent],"inc") 373 | osdtotal(myevents[myevent],1) 374 | poolevent(pgid[1],myevents[myevent],"inc") 375 | } 376 | } 377 | } 378 | 379 | / osdmap / { 380 | MYDTSTAMP=mydtstamp($1" "$2) 381 | histoevent(MYDTSTAMP,"OSDs","set",$11) 382 | histoevent(MYDTSTAMP,"OSDs UP","set",$13) 383 | histoevent(MYDTSTAMP,"OSDs IN","set",$15) 384 | } 385 | 386 | / osd\.[0-9]* out / { 387 | MYDTSTAMP=mydtstamp($1" "$2) 388 | MYEVENT="OSD Out" 389 | histoevent(MYDTSTAMP,MYEVENT,"inc") 390 | histototal(MYEVENT,1) 391 | if($9 ~ /^osd\./) 392 | osdpos=9 393 | if($11 ~ /^osd\./) 394 | osdpos=11 395 | osdevent($osdpos,MYEVENT,"inc") 396 | osdtotal(MYEVENT,1) 397 | } 398 | 399 | / but it is still running$/ { 400 | MYDTSTAMP=mydtstamp($1" "$2) 401 | MYEVENT="OSD Wrongly" 402 | histoevent(MYDTSTAMP,MYEVENT,"inc") 403 | histototal(MYEVENT,1) 404 | osdevent($3,MYEVENT,"inc") 405 | osdtotal(MYEVENT,1) 406 | } 407 | 408 | / wrongly marked me down$/ { 409 | MYDTSTAMP=mydtstamp($1" "$2) 410 | MYEVENT="OSD Wrongly" 411 | histoevent(MYDTSTAMP,MYEVENT,"inc") 412 | histototal(MYEVENT,1) 413 | osdevent($3,MYEVENT,"inc") 414 | osdtotal(MYEVENT,1) 415 | } 416 | 417 | / marked itself down / { 418 | MYDTSTAMP=mydtstamp($1" "$2) 419 | MYEVENT="OSD Down: Self" 420 | histoevent(MYDTSTAMP,MYEVENT,"inc") 421 | histototal(MYEVENT,1) 422 | osdevent($9,MYEVENT,"inc") 423 | osdtotal(MYEVENT,1) 424 | } 425 | 426 | /no active mgr/ { 427 | MYDTSTAMP=mydtstamp($1" "$2) 428 | MYEVENT="MGR: None Active" 429 | histoevent(MYDTSTAMP,MYEVENT,"inc") 430 | histototal(MYEVENT,1) 431 | } 432 | 433 | / calling new monitor election$/ { 434 | 
MYDTSTAMP=mydtstamp($1" "$2) 435 | MYEVENT="MON: Calling Election" 436 | histoevent(MYDTSTAMP,MYEVENT,"inc") 437 | histototal(MYEVENT,1) 438 | } 439 | 440 | / failed .*report.*from / { 441 | MYDTSTAMP=mydtstamp($1" "$2) 442 | MYEVENT="OSD Down: Reported Failed" 443 | histoevent(MYDTSTAMP,MYEVENT,"inc") 444 | histototal(MYEVENT,1) 445 | if($9 ~ /^osd\./) 446 | osdpos=9 447 | if($10 ~ /^osd\./) 448 | osdpos=10 449 | osdevent($osdpos,MYEVENT,"inc") 450 | osdtotal(MYEVENT,1) 451 | } 452 | 453 | / marked down after no pg stats for / { 454 | MYDTSTAMP=mydtstamp($1" "$2) 455 | MYEVENT="OSD Down: No PG stats" 456 | histoevent(MYDTSTAMP,MYEVENT,"inc") 457 | histototal(MYEVENT,1) 458 | osdevent($9,MYEVENT,"inc") 459 | osdtotal(MYEVENT,1) 460 | } 461 | 462 | / boot$/ { 463 | MYDTSTAMP=mydtstamp($1" "$2) 464 | MYEVENT="OSD Boot" 465 | histoevent(MYDTSTAMP,MYEVENT,"inc") 466 | histototal(MYEVENT,1) 467 | osdevent($10,MYEVENT,"inc") 468 | osdtotal(MYEVENT,1) 469 | } 470 | 471 | END { 472 | 473 | ## Begin outputting the histogram chart 474 | printf("DateTime") 475 | n=asorti(EVENTHEADERS) 476 | if(osdhisto!="") 477 | osdn=asorti(OSDEVENTHEADERS) 478 | for (i = 1; i<= n; i++ ) 479 | printf(",%s",EVENTHEADERS[i]) 480 | if(osdhisto!="") { 481 | for (i = 1; i<= osdn; i++) 482 | printf(",%s",OSDEVENTHEADERS[i]) 483 | } 484 | 485 | printf("\n") 486 | 487 | dtcount=asorti(EVENTCOUNT,DTS) 488 | 489 | for (dtindex =1; dtindex <= dtcount; dtindex++) { 490 | DT=DTS[dtindex] 491 | printf("%s:00", DT) 492 | for (i = 1; i<= n; i++ ) 493 | printf(",%s",EVENTCOUNT[DT][EVENTHEADERS[i]]) 494 | if(osdhisto!="") { 495 | # add-on the per OSD histo columns 496 | for (i = 1; i<= osdn; i++ ) 497 | printf(",%s",OSDEVENTCOUNT[DT][OSDEVENTHEADERS[i]]) 498 | } 499 | printf("\n") 500 | } 501 | 502 | ## Begin outputting the column totals line 503 | printf("Totals") 504 | for (i = 1; i<= n; i++ ) 505 | printf(",%s",EVENTTOTAL[EVENTHEADERS[i]]) 506 | if(osdhisto!="") { 507 | for (i = 1; i<= osdn; i++ ) 508 | printf(",%s",OSDEVENTTOTAL[OSDEVENTHEADERS[i]]) 509 | } 510 | 511 | printf("\n") 512 | printf("\n") 513 | 514 | ## Begin outputting the OSD chart 515 | o=asorti(OSDHEADERS,OHDR) 516 | 517 | if(osdtree != "") { 518 | printf("OSD Tree Path,") 519 | for(pathindex=2;pathindex<=maxpathdepth;pathindex++) 520 | printf(",") 521 | } 522 | 523 | printf("osd.id") 524 | for (i = 1; i<= o; i++ ) { 525 | printf(",%s",OHDR[i]) 526 | } 527 | printf("\n") 528 | 529 | if(osdtree=="") { 530 | for (OSD in OSDEVENT) { 531 | gsub(/^osd\./,"",OSD) 532 | OSDS[OSD]=OSD 533 | } 534 | osdcount=asort(OSDS) 535 | } else { 536 | osdcount=asorti(osdpathsbypath,OSDS) 537 | } 538 | for (osdindex=1; osdindex<=osdcount; osdindex++) { 539 | if(osdtree=="") 540 | osd="osd."OSDS[osdindex] 541 | else { 542 | osd=OSDS[osdindex] 543 | split(OSDS[osdindex],osdparts,",") 544 | osd=osdparts[length(osdparts)] 545 | 546 | printf("%s,",osdpaths[osd]) 547 | split(osdpaths[osd],pathjunk,",") 548 | pathdepth=length(pathjunk) 549 | if(pathdepth=1;bindex--) { 592 | printf("%s,",BKS[bindex]) 593 | split(BKS[bindex],bucketjunk,",") 594 | junklen=length(bucketjunk) 595 | for(i=junklen; i< maxpathdepth; i++) 596 | printf(",") 597 | for (i = 1; i<= o; i++ ) { 598 | if(BUCKETSUMMARY[BKS[bindex]][OHDR[i]]>0) 599 | printf(",%s",BUCKETSUMMARY[BKS[bindex]][OHDR[i]]) 600 | else 601 | printf(",") 602 | } 603 | printf("\n") 604 | } 605 | } else { 606 | ## Or print column totals if Bucket Summary is not selected 607 | printf("Totals") 608 | if(osdtree != "") { 609 | 
for(pathindex=2;pathindex<=maxpathdepth;pathindex++) 610 | printf(",") 611 | } 612 | for (i = 1; i<= o; i++ ) { 613 | printf(",%s",OSDTOTAL[OHDR[i]]) 614 | } 615 | } 616 | 617 | printf("\n\n") 618 | 619 | ## Begin outputting the Pool summary chart 620 | if ("Deep-Scrub: Count" in POOLHEADERS) { 621 | POOLHEADERS["Deep-Scrub: Average"]=1 622 | } 623 | poolcount=asorti(POOLEVENT,poolids) 624 | phdrcount=asorti(POOLHEADERS,PHDR) 625 | printf("Pool ID") 626 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) 627 | printf(",%s",PHDR[phdrindex]) 628 | printf("\n") 629 | for(pindex=1;pindex<=poolcount;pindex++) { 630 | printf("%s",poolids[pindex]) 631 | for(phdrindex=1;phdrindex<=phdrcount;phdrindex++) { 632 | if(PHDR[phdrindex]=="Deep-Scrub: Average") { 633 | if(POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"]) 634 | printf(",%0.6f",POOLEVENT[poolids[pindex]]["Deep-Scrub: Total"]/POOLEVENT[poolids[pindex]]["Deep-Scrub: Count"]) 635 | else 636 | printf(",") 637 | } else 638 | printf(",%s",POOLEVENT[poolids[pindex]][PHDR[phdrindex]]) 639 | } 640 | printf("\n") 641 | } 642 | } 643 | 644 | 645 | -------------------------------------------------------------------------------- /compacting_timing.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | ## 4 | ## Provide an OSD log for timing output of each leveldb Compaction event 5 | ## ./compacting_timing.awk ceph-osd.10.log 6 | ## 7 | ## 8 | 9 | BEGIN { 10 | begtime=0 11 | endtime=0 12 | } 13 | /leveldb: Compact/ { 14 | MYLINE=$0 15 | gsub(/[-:]/," ",$1) 16 | gsub(/[-:]/," ",$2) 17 | MYTIME=mktime($1" "$2) 18 | split($2,secs,".") 19 | millisecs=sprintf("0.%s",secs[2]) 20 | MYTIME+=millisecs 21 | 22 | if(begtime==0) { 23 | begtime=MYTIME 24 | } 25 | if(MYTIME>endtime) { 26 | endtime=MYTIME 27 | } 28 | if($6=="Compacting") { 29 | MYSTART=MYTIME 30 | next 31 | } 32 | 33 | if(MYSTART!="") { 34 | mydiff=MYTIME-MYSTART 35 | if(mydiff<mymin || mymin=="") { 36 | myminreq=MYLINE 37 | mymin=mydiff 38 | } 39 | if(mydiff>mymax || mymin=="") { 40 | mymaxreq=MYLINE 41 | mymax=mydiff 42 | } 43 | mysum+=mydiff 44 | mycount++ 45 | printf("%s,%s\n", mydiff, MYLINE) 46 | MYSTART="" 47 | } 48 | } 49 | END { 50 | if(mycount=="") 51 | mycount=1 52 | printf("Min,Avg,Max,Total Time Spent,%%Time spent in compaction\n%s,%s,%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,mysum/mycount,mymax,mysum,mysum/(endtime-begtime)*100,myminreq,mymaxreq) 53 | } 54 | -------------------------------------------------------------------------------- /deep-scrub_timing.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | 3 | ### 4 | # 5 | # Pipe a 'ceph.log' file into the script, redirect the output to a .csv file 6 | # 7 | # cat ceph.log | deep-scrub_timing.awk > deep-scrub_timing.csv 8 | # 9 | # Added ability to map acting OSDs to the PG scrub line 10 | # 11 | # Example: 12 | # cat ceph.log | deep-scrub_timing.awk -v pgdump=/path/to/pgdump > deep-scrub_timing.csv 13 | # 14 | ### 15 | 16 | function safediv(a,b) { 17 | if(b==0) { 18 | return 0 19 | } else { 20 | return a/b 21 | } 22 | } 23 | 24 | BEGIN { 25 | if(pgdump != "") { 26 | while(( getline line<pgdump ) > 0) { 27 | split(line,a," ") 28 | if(a[1] ~ /[0-9]*\.[0-9a-f]*/) 29 | gsub(/[\[\]]/, "", a[15]) 30 | gsub(/,/, ",osd.", a[15]) 31 | PGsToOSD[a[1]]="osd."a[15] 32 | } 33 | } 34 | } 35 | 36 | /deep-scrub/ { 37 | MYLINE=$0 38 | MYPG=$9 39 | gsub(/[-:]/," ",$1) 40 | gsub(/[-:]/," ",$2) 41 | MYTIME=mktime($1" "$2) 42 | split($2,secs,".") 43 | millisecs=sprintf("0.%s",secs[2]) 44 | MYTIME+=millisecs 45 | 46 | if($NF=="starts") 
{ 47 | MYSTART[MYPG]=MYTIME 48 | next 49 | } 50 | 51 | if(MYSTART[MYPG]!="") { 52 | mydiff=MYTIME-MYSTART[MYPG] 53 | if(mydiff<mymin || mymin=="") { 54 | myminreq=MYLINE 55 | mymin=mydiff 56 | } 57 | if(mydiff>mymax || mymin=="") { 58 | mymaxreq=MYLINE 59 | mymax=mydiff 60 | } 61 | mysum+=mydiff 62 | mycount++ 63 | printf("%s,%s,%s\n", mydiff,PGsToOSD[MYPG],MYLINE) 64 | } 65 | } 66 | END { 67 | printf("Min,Avg,Max\n%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,safediv(mysum,mycount),mymax,myminreq,mymaxreq) 68 | } 69 | -------------------------------------------------------------------------------- /images/Histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/Histogram.png -------------------------------------------------------------------------------- /images/OSDs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/OSDs.png -------------------------------------------------------------------------------- /images/Pools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/linuxkidd/ceph-log-parsers/76697e2c0cb080217c2a93fda7de4e150e618e51/images/Pools.png -------------------------------------------------------------------------------- /iops_histo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ] ; then 4 | echo 5 | echo ' Usage:' 6 | echo 7 | echo " $(basename $0) {ceph.log}" 8 | echo 9 | exit 1 10 | fi 11 | 12 | echo xThousand,Count 13 | 14 | grep pgmap $1 | awk -F\; '{split($3,a," "); print int(a[7]/1000) }' | sort -n | grep . | uniq -c | awk '{print $2","$1}' 15 | -------------------------------------------------------------------------------- /map_reporters_to_buckets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $2 ]; then 4 | echo 5 | echo "Usage: $(basename $0) <ceph.log> <osd_tree>" 6 | echo 7 | exit 1 8 | fi 9 | 10 | if [ ! -e $1 ]; then 11 | echo "File $1 does not exist." 12 | fi 13 | 14 | if [ ! -e $2 ]; then 15 | echo "File $2 does not exist." 16 | fi 17 | 18 | counters="f b" 19 | 20 | declare -A bucket_counts 21 | declare -A local_count 22 | tmpfile=$(mktemp) 23 | 24 | echo -n "Searching..." >&2 25 | awk '/reported failed/ { printf("f,%s\nb,%s\n",$9,$14)}' $1 | sort -n | uniq -c > $tmpfile 26 | echo -n ", mapping to buckets" >&2 27 | 28 | buckets=() 29 | currbuckets=() 30 | inhost=0 31 | depth_count=0 32 | 33 | echo "buckets...,reported,reporter" 34 | 35 | while read line; do 36 | thirdcol=$(echo $line | awk '{print $3}' | sed -e 's/[^0-9a-zA-Z_]/_/g') 37 | forthcol=$(echo $line | awk '{print $4}') 38 | if [ $(echo $thirdcol | grep -ic "^type$") -gt 0 ]; then 39 | continue 40 | fi 41 | if [ $(echo $thirdcol | grep -c ^osd\.) 
-gt 0 ]; then 42 | 43 | 44 | for j in $counters; do 45 | local_count[$j]=$(awk -v p="${j},${thirdcol}" '{if ( $2 == p ) { print $1 }}' $tmpfile) 46 | if [ -z ${local_count[$j]} ]; then 47 | local_count[$j]=0 48 | fi 49 | done 50 | 51 | for ((i = 0; i < ${#buckets[*]} ; i++)) { 52 | bucket=${buckets[$i]} 53 | for j in $counters; do 54 | ((bucket_counts[$j,$i]+=${local_count[$j]})) 55 | done 56 | echo -n "${!bucket}," 57 | } 58 | echo -n "$thirdcol" 59 | for j in $counters; do 60 | ((bucket_counts[$j,$i]+=${local_count[$j]})) 61 | echo -n ,${local_count[$j]} 62 | done 63 | echo 64 | else 65 | havebucket=-1 66 | for ((i = 0; i < ${#buckets[*]} ; i++)) { 67 | if [ ${buckets[$i]} == $thirdcol ]; then 68 | havebucket=$i 69 | fi 70 | } 71 | if [ $havebucket -eq -1 ]; then 72 | buckets+=($thirdcol) 73 | ((i++)) 74 | for j in $counters; do 75 | bucket_counts[$j,$i]=0 76 | done 77 | else 78 | highest_bucket=${#buckets[*]} 79 | for ((k = $highest_bucket; k > $havebucket; k--)) { 80 | for ((i = 0; i < $k ; i++)); do 81 | bucket=${buckets[$i]} 82 | echo -n "${!bucket}," 83 | done 84 | for j in $counters; do 85 | echo -n ${bucket_counts[$j,$k]}, 86 | bucket_counts[$j,$k]=0 87 | done 88 | echo 89 | if [ $k -gt $(($havebucket+1)) ]; then 90 | unset buckets[${#buckets[*]}-1] 91 | fi 92 | } 93 | fi 94 | declare "${thirdcol}=${forthcol}" 95 | fi 96 | done < $2 97 | highest_bucket=${#buckets[*]} 98 | for ((k = $highest_bucket; k > 0; k--)) { 99 | for ((i = 0; i < $k ; i++)); do 100 | bucket=${buckets[$i]} 101 | echo -n "${!bucket}," 102 | done 103 | for j in $counters; do 104 | echo -n ${bucket_counts[$j,$k]}, 105 | bucket_counts[$j,$k]=0 106 | rm -f $files[$j] 107 | done 108 | echo 109 | } 110 | echo >&2 111 | -------------------------------------------------------------------------------- /rgw_proc_time.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | BEGIN { 3 | MYMONTHS["Jan"]=1 4 | MYMONTHS["Feb"]=2 5 | MYMONTHS["Mar"]=3 6 | MYMONTHS["Apr"]=4 7 | MYMONTHS["May"]=5 8 | MYMONTHS["Jun"]=6 9 | MYMONTHS["Jul"]=7 10 | MYMONTHS["Aug"]=8 11 | MYMONTHS["Sep"]=9 12 | MYMONTHS["Oct"]=10 13 | MYMONTHS["Nov"]=11 14 | MYMONTHS["Dec"]=12 15 | } 16 | 17 | { 18 | gsub(/[-:]/," ",$1) 19 | gsub(/[-:]/," ",$2) 20 | ENDTIME=mktime($1" "$2) 21 | split($2,secs,".") 22 | millisecs=sprintf("0.%s",secs[2]) 23 | ENDTIME+=millisecs 24 | 25 | sub(/^./,"",$10) 26 | gsub(/[\/\-:]/," ",$10) 27 | maxb=split($10,b," ") 28 | b[2]=sprintf("%02d",MYMONTHS[b[2]]) 29 | STARTTIMESTRING=b[3]" "b[2]" "b[1]" "b[4]" "b[5]" "b[6] 30 | STARTTIME=mktime(STARTTIMESTRING) 31 | delta=ENDTIME-STARTTIME 32 | print $1" "$2" ("ENDTIME") -"STARTTIMESTRING" ("STARTTIME") :: "delta 33 | } 34 | -------------------------------------------------------------------------------- /rgw_req_timing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z $1 ]; then 4 | echo 5 | echo "Usage: $(basename $0) <rgw.log>" 6 | echo 7 | exit 1 8 | fi 9 | 10 | if [ ! -e $1 ]; then 11 | echo "File $1 does not exist." 
12 | fi 13 | 14 | awk ' 15 | BEGIN { 16 | mymin=999 17 | mymax=0 18 | mysum=0 19 | mycount=0 20 | } 21 | /req=/ { 22 | MYLINE=$0 23 | gsub(/[-:]/," ",$1) 24 | gsub(/[-:]/," ",$2) 25 | MYTIME=mktime($1" "$2) 26 | split($2,secs,".") 27 | millisecs=sprintf("0.%s",secs[2]) 28 | MYTIME+=millisecs 29 | if(match(MYLINE,/starting new request/)) { 30 | MYREQ=$9 31 | MYSTART[MYREQ]=MYTIME 32 | } 33 | if(match(MYLINE,/req done/)) { 34 | MYREQ=$8 35 | if(MYSTART[MYREQ]!="") { 36 | mydiff=MYTIME-MYSTART[MYREQ] 37 | if(mydiff<mymin) { 38 | myminreq=MYLINE 39 | mymin=mydiff 40 | } 41 | if(mydiff>mymax) { 42 | mymaxreq=MYLINE 43 | mymax=mydiff 44 | } 45 | mysum+=mydiff 46 | mycount++ 47 | printf("%s,%s\n", mydiff, MYLINE) 48 | } 49 | } 50 | } 51 | END { 52 | printf("Min,Avg,Max\n%s,%s,%s\nMin Req: %s\nMax Req: %s\n",mymin,mysum/mycount,mymax,myminreq,mymaxreq) 53 | } 54 | ' $1 55 | -------------------------------------------------------------------------------- /slow_by_osd-pool-type.awk: -------------------------------------------------------------------------------- 1 | #!/usr/bin/awk -f 2 | # 3 | # by: Michael Kidd 4 | # https://github.com/linuxkidd 5 | # 6 | # Use: 7 | # chmod 755 slow_by_osd-pool-type.awk 8 | # ./slow_by_osd-pool-type.awk ceph.log 9 | # 10 | # Output Options: 11 | # -v csv=1 12 | # -v pivot=1 13 | # -v perline=1 14 | # 15 | # Note: with no '-v' option specified, it provides 'visual' output for easy human parsing 16 | # Note2: Only one output option may be used per execution 17 | # 18 | # 19 | 20 | BEGIN { 21 | PROCINFO["sorted_in"] = "@val_num_asc" 22 | } 23 | 24 | /slow request [3-5][0-9]\./ { 25 | if($20 ~ /^[0-9]*\.[0-9a-fs]*$/) { 26 | split($20,a,".") 27 | b=$0 28 | gsub(/^.*currently /,"",b) 29 | gsub(/ from .*/, "", b) 30 | slowtype[b]++ 31 | slowosd[$3]++ 32 | slowosdbytype[$3][b]++ 33 | slowbypool[a[1]]++ 34 | slowbypooltype[a[1]][b]++ 35 | slowpoolosdtype[a[1]][$3][b]++ 36 | slowtypepools[b][a[1]]++ 37 | } 38 | } 39 | 40 | function printVisual() { 41 | print "Pool stats: " 42 | for(p in slowbypool) { 43 | print "Pool id: "p" Total slow: "slowbypool[p] 44 | for (t in slowbypooltype[p]) { 45 | print "\t"slowbypooltype[p][t]"\t"t 46 | } 47 | } 48 | print "" 49 | print "" 50 | print "OSD Stats: " 51 | for (o in slowosd) { 52 | print "\t"o" "slowosd[o] 53 | for (t in slowosdbytype[o]) { 54 | print "\t\t"slowosdbytype[o][t]" "t 55 | } 56 | } 57 | print "" 58 | print "" 59 | print "Slow by Type: " 60 | for (t in slowtype) { 61 | print "\t"slowtype[t]" "t 62 | } 63 | } 64 | 65 | function printCSV() { 66 | printf("Pool,") 67 | for(t in slowtype) { 68 | printf("%s,",t) 69 | } 70 | print "" 71 | for(p in slowbypool) { 72 | printf("%s,",p) 73 | for (t in slowtype) { 74 | printf("%d,",slowbypooltype[p][t]) 75 | } 76 | print "" 77 | } 78 | printf("Total:,") 79 | for (t in slowtype) { 80 | printf("%d,",slowtype[t]) 81 | } 82 | print "" 83 | print "" 84 | printf("OSD,") 85 | for(t in slowtype) { 86 | printf("%s,",t) 87 | } 88 | print "" 89 | for (o in slowosd) { 90 | printf("%s,",o) 91 | for (t in slowtype) { 92 | printf("%s,",slowosdbytype[o][t]) 93 | } 94 | print "" 95 | } 96 | printf("Total:,") 97 | for (t in slowtype) { 98 | printf("%d,",slowtype[t]) 99 | } 100 | print "" 101 | } 102 | 103 | function printPerLine() { 104 | print "Pool,OSD,Type,Count" 105 | for(p in slowpoolosdtype){ 106 | for(o in slowpoolosdtype[p]) { 107 | for(t in slowpoolosdtype[p][o]) 108 | print p","o","t","slowpoolosdtype[p][o][t] 109 | } 110 | } 111 | } 112 | 113 | function printPivot() { 114 | printf(",") 115 | for(t in slowtype) { 116 | printf("%s",t) 117 | for(p in 
slowtypepools[t]) { 118 | l2=l2","p 119 | ptotal=ptotal","slowtypepools[t][p] 120 | sumtotal+=slowtypepools[t][p] 121 | printf(",") 122 | } 123 | } 124 | print "Totals" 125 | printf("OSD / Pool ID%s\n",l2) 126 | for(o in slowosd) { 127 | printf("%s,",o) 128 | for(t in slowtype) { 129 | for(p in slowtypepools[t]) { 130 | if(slowpoolosdtype[p][o][t]>0) 131 | printf("%d,",slowpoolosdtype[p][o][t]) 132 | else 133 | printf(",") 134 | } 135 | } 136 | print slowosd[o] 137 | } 138 | print "Totals:"ptotal","sumtotal 139 | } 140 | 141 | END { 142 | if(csv==1) 143 | printCSV() 144 | else if(pivot==1) 145 | printPivot() 146 | else if(perline==1) 147 | printPerLine() 148 | else 149 | printVisual() 150 | } 151 | 152 | --------------------------------------------------------------------------------
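
Note on the shared pattern: compacting_timing.awk, deep-scrub_timing.awk and the awk program embedded in rgw_req_timing.sh all time events the same way: record the timestamp of a "starts" line under some key (a PG id, a request id, or a single compaction slot), subtract it when the matching completion line appears, and keep running min/avg/max totals. The sketch below is illustrative only and is not a file in this repository; the key field ($9) and the "starts"/"ok" markers are placeholders to adapt to whichever log is being parsed.

#!/usr/bin/awk -f
# Minimal gawk sketch of the start/finish pairing used by the timing scripts above.
# Assumes ceph-style "YYYY-MM-DD HH:MM:SS.ffffff" timestamps in fields 1 and 2;
# $9 as the pairing key and the /starts/ and /ok/ markers are examples only.
{
    gsub(/[-:]/," ",$1)                         # "2017-01-02"     -> "2017 01 02"
    gsub(/[-:]/," ",$2)                         # "03:04:05.678"   -> "03 04 05.678"
    split($2,secs,".")                          # separate the fractional seconds
    t=mktime($1" "secs[1])+sprintf("0.%s",secs[2])
    key=$9                                      # e.g. PG id or request id
}
/ starts$/ { start[key]=t; next }               # remember when the event began
/ ok$/ {
    if(start[key]=="") next                     # completion with no recorded start
    d=t-start[key]
    if(mymin=="" || d<mymin) { mymin=d; myminreq=$0 }
    if(mymax=="" || d>mymax) { mymax=d; mymaxreq=$0 }
    mysum+=d; mycount++
    printf("%s,%s\n",d,$0)                      # per-event duration, CSV style
}
END {
    if(mycount) printf("Min,Avg,Max\n%s,%s,%s\n",mymin,mysum/mycount,mymax)
}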