├── bin
│   └── README
├── default
│   ├── app.conf
│   ├── data
│   │   └── ui
│   │       ├── nav
│   │       │   └── default.xml
│   │       └── views
│   │           ├── README
│   │           ├── bucket_roll_analysis.xml
│   │           ├── cluster_master_performance.xml
│   │           ├── crash_dump_analysis.xml
│   │           ├── debug_bucket_rolls.xml
│   │           ├── debug_cache_manager_misses.xml
│   │           ├── debug_incoming_forwarders.xml
│   │           ├── debug_indexer_performance.xml
│   │           ├── debug_ingestion.xml
│   │           ├── debug_peer_is_down.xml
│   │           ├── debug_replication.xml
│   │           ├── debug_search.xml
│   │           ├── discovery_forwarding_hierrachy.xml
│   │           ├── discovery_searches.xml
│   │           ├── event_delay_for_host.xml
│   │           ├── event_delay_for_index.xml
│   │           ├── event_delay_index_sourcetype.xml
│   │           ├── event_distribution_measurement.xml
│   │           ├── find_cluster_master_events.xml
│   │           ├── home.xml
│   │           ├── indexer_performance.xml
│   │           ├── internal_indexes_breakdown.xml
│   │           ├── measuring_concurrency.xml
│   │           ├── roll_your_own_tstats_acceleration.xml
│   │           ├── search_head_resource_utilisation.xml
│   │           ├── search_performance_evaluator.xml
│   │           ├── trace_back_indexer_search_load.xml
│   │           ├── tstats_performance_comparision.xml
│   │           └── vcpu_infrastructure_sizing.xml
│   └── searches.conf
├── local
│   ├── app.conf
│   ├── data
│   │   └── ui
│   │       └── views
│   │           ├── bucket_size_analysis
│   │           ├── bursting_forwarders_and_indexing_delay.xml
│   │           ├── event_distribution_measurements.xml
│   │           ├── intermediate_forwarders_switching_efficiency_analysis.xml
│   │           └── top_data_generating_source_forwarder_analysis.xml
│   └── savedsearches.conf
├── metadata
│   ├── default.meta
│   └── local.meta
└── vcpu_pricing

--------------------------------------------------------------------------------
/bin/README:
--------------------------------------------------------------------------------

1 | This is where you put any scripts you want to add to this app.
2 |
--------------------------------------------------------------------------------
/default/app.conf:
--------------------------------------------------------------------------------

 1 | #
 2 | # Splunk app configuration file
 3 | #
 4 |
 5 | [install]
 6 | is_configured = 0
 7 |
 8 | [ui]
 9 | is_visible = 1
10 | label = Event distribution tools
11 |
12 | [launcher]
13 | author = Richard Morgan
14 | description = A collection of dashboards to measure event distribution by various metrics
15 | version = 1.0
16 |
17 |
--------------------------------------------------------------------------------
/default/data/ui/nav/default.xml:
--------------------------------------------------------------------------------

1 | 8 |
--------------------------------------------------------------------------------
/default/data/ui/views/README:
--------------------------------------------------------------------------------

1 | There are a lot of dashboards in this collection of varying quality.
2 |
3 | Install the one called "home" into any app that you like, "search" for instance.
4 |
5 | The "home" dashboard has links to the major dashboards and a description of what they are trying to achieve.
--------------------------------------------------------------------------------
/default/data/ui/views/crash_dump_analysis.xml:
--------------------------------------------------------------------------------

1 |
2 | 3 | Another groovy debug dashboard from Richard Morgan productions 4 | 5 | (index=_audit) 6 | (index=_internal OR index=core_splunk_internal) 7 | (index=_internal OR index=core_splunk_internal) (sourcetype=metrics OR sourcetype=splunkd) METRICS 8 | index=_introspection 9 | 10 | 11 | $splunkd$ $filter_host$ sourcetype=splunkd_crash_log TERM(build) TERM(Received) TERM(fatal) TERM(signal) TERM(splunkd) 12 | | rex field=_raw "^\[build (?<build>[0-9a-f]{12})\]" 13 | | rex field=_raw "Received fatal signal (?<received_fatal_signal>\d)(\s(?<signal_description>[^.]+))?" 14 | | rex field=_raw "Crashing thread: (?<crashing_thread>[^\s-]+)" 15 | | rex field=_raw max_match=100 "\n\s\s(?<back_trace>\[[^]]+\]\s(?<function_call>[^\s]+).*)" 16 | | rex field=_raw "Last errno: (?<last_errno>\d+)" 17 | | eval crash=case( 18 | searchmatch("TERM(build) TERM(Process) TERM(renamed) TERM(CLOCK_MONOTONIC)"),"search", 19 | searchmatch("TERM(build) NOT(TERM(Process) TERM(renamed) TERM(CLOCK_MONOTONIC))"),"mothership"), 20 | signature=md5(mvjoin(function_call,"")) 21 | | fields _time host build crash received_fatal_signal crashing_thread back_trace threads_running function_call signature signal_description 22 | | stats list(*) as * by _time host 23 | | foreach 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 24 | [| eval sub_stack=if(mvcount(function_call)>=<<FIELD>>, mvappend(sub_stack, mvindex(function_call,<<FIELD>>)),sub_stack), 25 | sig_stack=if(mvcount(function_call)>=<<FIELD>>, mvappend(sig_stack,md5(mvjoin(sub_stack,""))),sig_stack)] 26 | | fields - sub_stack back_trace 27 | | lookup build_version_map.csv build 28 | | eval build_version=if(isnull(version),build, build." (".version.")") 29 | $wait_for_lookup$ 30 | 31 | $time.earliest$ 32 | $time.latest$ 33 | 34 | 35 | | makeresults | eval 
build_map="6.0,182037|6.0.0.4,204109|6.0.1,189883|6.0.1.3,204108|6.0.10,272969|6.0.11,276357|6.0.12,277522|6.0.13,278006|6.0.14,278382|6.0.15,278774|6.0.2,196940|6.0.2.2,204107|6.0.3,204048|6.0.3,204106|6.0.5,214064|6.0.6,228831|6.0.7,241889|6.0.8,259987|6.0.9,267137|6.1.0,206881|6.1.1,207789|6.1.11,277527|6.1.12,278005|6.1.13,278305|6.1.14,278775|6.1.2,213098|6.1.3,220630|6.1.4,233537|6.1.5,239630|6.1.6,249101|6.1.7,260715|6.1.8,266909|6.1.9,272667|6.2.0,237341|6.2.0.1,237360|6.2.1,245427|6.2.1.1,252428|6.2.1.2,259063|6.2.1.3,262152|6.2.10,276849|6.2.11,277653|6.2.12,277845|6.2.13,278211|6.2.13.1,278423|6.2.14,278574|6.2.15,278770|6.2.2,255606|6.2.2.1,260295|6.2.3,264376|6.2.3.1,265391|6.2.3.1.1,271204|6.2.4,271043|6.2.4.1,272114|6.2.4.2,274692|6.2.5,272645|6.2.6,274160|6.2.7,275064|6.2.8,275559|6.2.9,276372|6.3.0,aa7d4b1ccb80|6.3.0.1,bac478facca4|6.3.0.2,8af456cb02af|6.3.0.3,a82aa90572fa|6.3.0.4,dc46d23c16ff|6.3.0.5,e4850de2317c|6.3.1,f3e41e4b37b2|6.3.1.0.1,9db85a215ec7|6.3.10,75de5c491bd1|6.3.11,d17c287b5612|6.3.12,911c7597cdd8|6.3.13,009ffcd0f855|6.3.1511.1,90ea9ab275dc|6.3.1511.2,989ef825729e|6.3.1511.3,40da1354c197|6.3.2,aaff59bb082c|6.3.2.1,cf197124fd48|6.3.2.2,b34133cb9679|6.3.3,f44afce176d0|6.3.3.0.1,8b48a3a5644b|6.3.3.2,1c1e99984d4c|6.3.3.3,9ee89d42abdd|6.3.3.4,bea15cb6e512|6.3.4,cae2458f4aef|6.3.5,8ef3e646d7b6|6.3.5.1,145a748c01fb|6.3.5.2,ffe4dc899759|6.3.6,1dc2bfe9d42|6.3.7,8bf976cd6a7c|6.3.8,1e8d95973e45|6.3.9,75b73647f7cd|6.4.0,f2c836328108|6.4.1,debde650d26e|6.4.1.1,926d5c41d8d1|6.4.1.1.1,377811cf5aef|6.4.1.1.2,ac60bacb82a8|6.4.1.2,00de1bf7ead6|6.4.1.2.1,7eedf7e1a8ff|6.4.1.3,ab4d3120c351|6.4.10,1c39464735cc|6.4.11,0691276baf18|6.4.2,00f5bb3fa822|6.4.3,b03109c2bad4|6.4.4,b53a5c14bb5e|6.4.4.1,01e053f0470e|6.4.5,e82289930bdd|6.4.6,6635aa31e851|6.4.7,cac34cc2bb32|6.4.8,5dedc6298537|6.4.9,493044ecc65a|6.5.0,59c8927def0f|6.5.1,f74036626f0c|6.5.1.1,39c7fd872a87|6.5.1.2,8a20d8d7400d|6.5.1.3,d7d84fa6a0bd|6.5.10,8114be174b06|6.5.1612,a8914247a786|6.5.2,67571ef4b87d|6.5.2.1,6ddbff6d9c2c|6.5.2.2,2c493b610abf|6.5.2.3,1ad764225ac0|6.5.2.4,fe64800a0fd4|6.5.3,36937ad027d4|6.5.3.1,bf0ff7c2ab8b|6.5.3.2,40075dab5beb|6.5.4,adb84211dd7c|6.5.4.1,d67480c977b1|6.5.5,586c3ec08cfb|6.5.5.1,3d35d7c516a2|6.5.6,44f873cfa227|6.5.7,f44cfc17f820|6.5.8,96271d9ba09a|6.5.9,eb980bc2467e|6.6.0,1c4f3bbe1aea|6.6.1,aeae3fe0c5af|6.6.10,2b5f6c3d5f96|6.6.11,a4e9ea700cba|6.6.12,ff1b28d42e4c|6.6.2,4b804538c686|6.6.3,e21ee54bc796|6.6.3.2,6d752d94c69f|6.6.3.3,6d5c511f21ab|6.6.3.4,61f4a4172dd0|6.6.3.5,3c9c61252428|6.6.3.6,8112d4c3bd5d|6.6.4,00895e76d346|6.6.4.1,0cdd444994f1|6.6.5,b119a2a8b0ad|6.6.6,ff5e72edc7c4|6.6.6.1,3726298c0da0|6.6.7,429660948eb8|6.6.8,6c27a8439c1e|6.6.9,7ca2e86659b7|7.0.0,c8a78efdd40f|7.0.0.1,baf324169244|7.0.1,2b5b15c4ee89|7.0.10,d8401e2713e7|7.0.11,ca372bdc34bc|7.0.11.1,890181452bae|7.0.12,2b7671dcb5e1|7.0.13,b6e41c05f519|7.0.2,03bbabbd5c0f|7.0.2.1,a52a53f7edc4|7.0.3,fa31da744b51|7.0.3.1,db5bd1668d94|7.0.3.2,5bdcd5729ae5|7.0.3.3,268bf7b2599f|7.0.3.4,26590c68a494|7.0.3.5,89a1575a5c0f|7.0.3.6,c7bfc947bd0b|7.0.3.7,6169add9bd45|7.0.3.8,637dc937bd32|7.0.4,68ba48f99743|7.0.4.1,9ae125c3d908|7.0.4.2,a162f38e0dbf|7.0.5,23d8175df399|7.0.5.1,2aba2d3d71bc|7.0.5.2,e488e1389aaf|7.0.5.3,8f93f8750357|7.0.5.4,0700a78c551d|7.0.5.5,02b315c1c42f|7.0.5.6,1b0251355513|7.0.5.7,3c8832c91b7d|7.0.6,3e6d6611992a|7.0.7,b803471b1c68|7.0.8,b1976516a355|7.0.8.1,30deaae500d1|7.0.8.2,c46937989eff|7.0.8.3,9394e0aeb257|7.0.8.4,ae3b0d980261|7.0.8.5,c3e02dedf40a|7.0.8.7,9bd1046c2cc1|7.0.9,12f0d9382e96|7.0.9.1,05c842e8a806|7.1.2,a0c72a66
db66|7.1.2.1,b74cd94fb801|7.1.3,51d9cac7b837|7.1.3.1,1f6ea26f4030|7.1.3.1.1,8c830b55aa74|7.1.3.3,b219f32f0f0f|7.1.3.4,c68e68e6c8bd|7.1.3.6,062d1f3ce951|7.1.4,5a7a840afcb3|7.1.4.1,102891c799cd|7.1.5,fd4da3d4caf1|7.1.6,8f009a3f5353|7.1.6.1,3a8f19971411|7.1.6.2,2f026e4e011c|7.1.7,39ea4c097c30|7.1.8,3856f9bb4747|7.1.9,45b25e1f9be3|7.2.0,8c86330ac18|7.2.1,be11b2c46e23|7.2.1.1,2d6acd5fa26e|7.2.1.2,f89ad7279024|7.2.1.3,06eae1bf22c3|7.2.2,48f4f251be37|7.2.3,06d57c595b80|7.2.3.1,402d27e7f7db|7.2.4,8a94541dcfac|7.2.4.1,16c50c8a1b70|7.2.4.2,fb30470262e3|7.2.5,088f49762779|7.2.5.1,962d9a8e1586|7.2.6,c0bf0f679ce9|7.2.7,f817a93effc2|7.2.7.1,0bc984fe4eeb|7.2.7.3,2bfb24ae5e79|7.2.7.4,578dd6e8db4e|7.2.8,d613a50d43ac|7.2.8.1,fb2982d8fde7|7.2.9,2dc56eaf3546|7.3.0,657388c7a488|7.3.1,bd63e13aa157|7.3.1.1,8e225e1518e2|7.3.2,c60db69f8e32|7.3.3,7af3758d0d5e|8.0.0,1357bef0a7f6" 36 | | eval pair=split(build_map,"|") 37 | | fields - build_map _time 38 | | mvexpand pair 39 | | eval pair=split(pair,","), version=mvindex(pair,0), build=mvindex(pair,1) 40 | | fields - pair 41 | | outputlookup build_version_map.csv 42 | 43 | -24h@h 44 | now 45 | 46 | | noop 47 | 48 | 49 |
50 | 51 | 52 | 53 | -24h@h 54 | now 55 | 56 | 57 | 58 | if(isnull($time.latest$),31, if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)<31,31,round(((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)/31)*31)) 59 | if(isnull($time.latest$),10,if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)<10,10,round(((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)/10)*10)) 60 | 61 | 0 62 | now() 63 | 64 | 65 | 66 | 67 | 68 | 100 69 | 250 70 | 500 71 | 750 72 | 1000 73 | 500 74 | 75 | 76 | 77 | span= 78 | sec 79 | 10 80 | 81 | 82 | 83 | span= 84 | sec 85 | 31 86 | 87 | 88 | 89 | * 90 | 91 |
92 | 93 | 94 | Crashes over time by build and version 95 | 96 | 97 | 98 | | timechart $selected_metrics_span$ count by build_version 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | Crashes by build and version 110 | 111 | 112 | | chart count by build_version 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | Crashes by fatal signal 123 | 124 | 125 | | chart count by signal_description 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | Crashes by thread 138 | 139 | 140 | | stats count by crashing_thread 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | Crashes by signatures 151 | 152 | 153 | | stats count by signature 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | Select crashing server to drill down to review crashes 165 | 166 | 167 | * 168 | label 169 | build_version 170 | 171 | | chart count by build_version 172 | | eval label=build_version." (".count.")" 173 | 174 | * 175 | 176 | 177 | 178 | * 179 | label 180 | signature 181 | 182 | | stats count by signature 183 | | eval label=signature." (".count.")" 184 | 185 | * 186 | 187 | 188 | 189 | * 190 | label 191 | crashing_thread 192 | 193 | | stats count by crashing_thread 194 | | eval label=crashing_thread." (".count.")" 195 | 196 | * 197 | 198 | 199 | 200 | * 201 | label 202 | received_fatal_signal 203 | 204 | | stats count by received_fatal_signal 205 | | eval label=received_fatal_signal." (".count.")" 206 | 207 | * 208 | 209 | 210 | 211 | | fillnull crashing_thread value="" 212 | | fillnull signature value="" 213 | | fillnull crash value="" 214 | | search signature="$selected_signature$" build_version="$selected_build_version$" crashing_thread="$selected_thread$" received_fatal_signal="$selected_signal$" 215 | | stats count by host version crash crashing_thread 216 | 217 | 218 | 219 | 220 | 221 | $row.host$ 222 | 223 |
224 |
225 |
226 | 227 | 228 | 229 | Here is the script required to extract the builds to version mapping. 230 | 231 | for v in $(curl -s http://releases.splunk.com/released_builds/ | perl -ne 'print "$1\n" if /href="([678](?:\.\d+)+)\/"/'); do for base in debug splunk/windows; do curl -s http://releases.splunk.com/released_builds/$v/$base/ | perl -ne 'print "$1,$2,$3\n" if /href="splunkforwarder-unstripped-((?:\d+\.)+\d+)-([0-9a-fA-F]+)-([^\.]+)\.tgz"/'; done; done 232 | 233 | 234 | 235 | 236 | 237 | Drill down on crashes for $selected_host$ 238 | 239 | Drill down to see splunkd.log 1min before crash 240 | 241 | | search host="$selected_host$" 242 | | eval _real_time=_time 243 | 244 | 245 | 246 | 247 | $row._real_time$-60 248 | search?q=$splunkd$ sourcetype=splunkd host=$row.host$ earliest=$drilldown_time_earliest$ latest=$row._real_time$&amp;earliest=$drilldown_time_earliest$&amp;latest=$row._real_time$ 249 | 250 |
251 |
252 |
253 |
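
Note: the key idea in this dashboard is crash fingerprinting: the function names from the backtrace are concatenated and hashed (signature=md5(mvjoin(function_call,""))), so identical stacks group together across hosts and builds. A minimal standalone sketch of the same idea, reusing the rex pattern from the base search above:

index=_internal sourcetype=splunkd_crash_log "Crashing thread:"
| rex field=_raw max_match=100 "\n\s\s\[[^]]+\]\s(?<function_call>[^\s]+)"
| eval signature=md5(mvjoin(function_call,""))
| stats count dc(host) as hosts by signature
| sort - count

Each row is one distinct crash stack; a signature seen on many hosts usually points at a product bug rather than one bad box.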
254 | -------------------------------------------------------------------------------- /default/data/ui/views/debug_bucket_rolls.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 62 4 | (index=core_splunk_internal OR index=_internal) 5 | 6 | 7 | 8 | $_internal$ $selected_targets$ source=*splunkd.log sourcetype=splunkd component=HotBucketRoller TERM(caller=lru) 9 | | rex field=to "db_(?<to_time>[0-9]{10})_(?<from_time>[0-9]{10})" 10 | | eval size_mb=size/(1024*1024) 11 | | eval duration=to_time-from_time 12 | | table _time host idx bid duration from to size_mb caller to_time from_time 13 | $time.earliest$ 14 | $time.latest$ 15 | 1 16 | 17 |
18 | 19 | 20 | label 21 | search 22 | 23 | $_internal$ sourcetype=splunkd CMMaster status=success site* earliest=-4hr latest=now 24 | | rex field=_raw max_match=64 "(?<site_pair>site\d+,\"?[^\",]+)" 25 | | eval cluster_master=host 26 | | fields + site_pair cluster_master 27 | | fields - _* 28 | | dedup site_pair 29 | | mvexpand site_pair 30 | | dedup site_pair 31 | | rex field=site_pair "^(?<site_id>site\d+),\"?(?<indexer>.*)" 32 | | rex field=cluster_master "^(?<short_name_cm>[^\.]+)" 33 | | eval search="host=".indexer, host_count=1 34 | | appendpipe 35 | [| stats values(indexer) as indexers by site_id short_name_cm 36 | | eval host_count=mvcount(indexers), 37 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 38 | | eval label=site_id." (".host_count." idxs @ ".short_name_cm 39 | ] 40 | | appendpipe 41 | [| stats values(indexer) as indexers dc(site_id) as site_count by short_name_cm 42 | | eval host_count=mvcount(indexers), 43 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 44 | | eval label=short_name_cm." (".host_count." idx ".site_count." sites)" ] 45 | | rex field=indexer "^(?<short_name_idx>[^\.]+)" 46 | | eval label=if(isnull(label), short_name_idx." (".site_id."@".short_name_cm.")", label) 47 | | stats max(host_count) as count by label search 48 | | sort 0 - count 49 | -24h@h 50 | now 51 | 52 | 53 | 54 | 55 | $selected_indexers$ 56 | 57 | 58 | None 59 | None 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -7d@h 68 | now 69 | 70 | 71 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<31,31,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 72 | 73 | 74 | 75 | 76 | Crude 77 | Low 78 | Medium 79 | High 80 | Ultra 81 | 500 82 | 83 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<31,31,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 84 | 85 | 86 |
87 | 88 | 89 | 90 | 91 | | eval bin_size=size_mb 92 | | bin bin_size span=log2 93 | | sort + bin_size 94 | | chart limit=100 count by bin_size idx 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | | noop 109 | 110 | 111 |
112 |
113 |
114 |
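
Note: the histogram above leans on bin's logarithmic spans (span=log2), which keep bucket sizes readable when they range from a few MB to tens of GB. The same measurement as a standalone sketch, using the HotBucketRoller extraction from the base search (size and idx are key=value pairs auto-extracted from splunkd.log):

index=_internal sourcetype=splunkd component=HotBucketRoller TERM(caller=lru)
| eval size_mb=size/(1024*1024)
| bin size_mb span=log2
| stats count by size_mb idx
| sort + size_mb

Many rolls landing in the small size bins generally means buckets are rolling early under lru pressure rather than filling up.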
115 | -------------------------------------------------------------------------------- /default/data/ui/views/debug_cache_manager_misses.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | round($job.runDuration$,2) 6 | 7 | index=_internal group=cachemgr_bucket 8 | | bin _time span=$selected_span$sec 9 | | stats sum(cache_miss) as cache_miss by host _time 10 | 11 | $time.earliest$ 12 | $time.latest$ 13 | 14 | 15 | 61 16 | 00:01:02 17 | 18 |
19 | 20 | 21 | 22 | -4h@m 23 | now 24 | 25 | 26 | 27 | 28 | relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$) 29 | ceiling($duration_seconds$/300/31)*31 30 | 31 | 32 | 33 | 34 | 35 | 36 | 62 37 | 38 | tostring($form.selected_span$,"duration") 39 | 40 | 41 | 42 | 43 | 00:01:02 44 | 45 |
46 | 47 | 48 | Cache manager misses per host, select time range + host for drill down 49 | 50 | 51 | 52 | $job.runDuration$ 53 | 54 | | xyseries _time host cache_miss 55 | 56 | 57 | 58 | 59 | 60 | 61 | "host=".$click.name2$ 62 | 63 | 64 | $start$ 65 | $end$ 66 | 67 | 68 | 69 | 70 | 71 | 72 | Searches executing on $selected_host$ 73 | 74 | Drill down to see details of search across all indexes 75 | 76 | 77 | $job.runDuration$ 78 | 79 | 80 | $job.runDuration$ 81 | 82 | index=_internal $selected_host$ sourcetype=splunkd_remote_searches StreamedSearch Streamed search connection terminated search_id=* 83 | 84 | | rex field=_raw "search_rawdata_bucketcache_error=(?<rawdata_bucketcache_error>[\d.]+)" 85 | | rex field=_raw "search_rawdata_bucketcache_miss=(?<rawdata_bucketcache_miss>[\d.]+)" 86 | | rex field=_raw "search_index_bucketcache_error=(?<index_bucketcache_error>[\d.]+)" 87 | | rex field=_raw "search_index_bucketcache_hit=(?<index_bucketcache_hit>[\d.]+)" 88 | | rex field=_raw "search_index_bucketcache_miss=(?<index_bucketcache_miss>[\d.]+)" 89 | | rex field=_raw "search_rawdata_bucketcache_hit=(?<rawdata_bucketcache_hit>[\d.]+)" 90 | | rex field=_raw "search_rawdata_bucketcache_miss_wait=(?<rawdata_bucketcache_miss_wait>[\d.]+)" 91 | | rex field=_raw "search_index_bucketcache_miss_wait=(?<index_bucketcache_miss_wait>[\d.]+)" 92 | | rex field=_raw "drop_count=(?<drop_count>[\d.]+)" 93 | | rex field=_raw "scan_count=(?<scan_count>[\d.]+)" 94 | | rex field=_raw "eliminated_buckets=(?<eliminated_buckets>[\d.]+)" 95 | | rex field=_raw "considered_events=(?<considered_events>[\d.]+)" 96 | | rex field=_raw "decompressed_slices=(?<decompressed_slices>[\d.]+)" 97 | | rex field=_raw "events_count=(?<events_count>[\d.]+)" 98 | | rex field=_raw "total_slices=(?<total_slices>[\d.]+)" 99 | | rex field=_raw "considered_buckets=(?<considered_buckets>[\d.]+)" 100 | | stats 101 | sum(rawdata_bucketcache_error) as search_rawdata_bucketcache_error_sum 102 | sum(rawdata_bucketcache_miss) as search_rawdata_bucketcache_miss_sum 103 | sum(index_bucketcache_error) as search_index_bucketcache_error_sum 104 | sum(index_bucketcache_hit) as search_index_bucketcache_hit_sum 105 | sum(index_bucketcache_miss) as search_index_bucketcache_miss_sum 106 | sum(rawdata_bucketcache_hit) as search_rawdata_bucketcache_hit_sum 107 | sum(rawdata_bucketcache_miss_wait) as search_rawdata_bucketcache_miss_wait_sum. 108 | sum(index_bucketcache_miss_wait) as search_index_bucketcache_miss_wait_sum. 109 | min(_time) as time_min 110 | max(_time) as time_max 111 | sum(drop_count) as drop_count_sum 112 | sum(scan_count) as scan_count_sum14746 113 | sum(eliminated_buckets) as eliminated_buckets_sum 114 | sum(considered_events) as considered_events_sum 115 | sum(decompressed_slices) as decompressed_slices_sum 116 | sum(events_count) as events_count_sum14746 117 | sum(total_slices) as total_slices_sum 118 | sum(considered_buckets) as considered_buckets_sum 119 | by search_id server 120 | | search search_id=remote_* 121 | | eval cache_misses=search_rawdata_bucketcache_miss_sum + search_index_bucketcache_miss_sum 122 | | sort - search_rawdata_bucketcache_miss_sum 123 | | table search_id cache_misses * 124 | | sort - cache_misses 125 | $selection_earliest$ 126 | $selection_latest$ 127 | 128 | 129 | 130 | 131 | search?q=index=_*%20$row.search_id=$&earliest=$selection_earliest$&latest=$selection_latest$ 132 | 133 |
134 |
135 |
136 |
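
Note: before drilling into individual searches it can help to rank hosts by total misses over the whole period; a minimal companion sketch to the base search above (same cachemgr_bucket metrics, no time bucketing):

index=_internal group=cachemgr_bucket
| stats sum(cache_miss) as total_misses by host
| sort - total_misses

The hosts at the top are the ones worth selecting in the chart above for the per-search breakdown.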
137 | -------------------------------------------------------------------------------- /default/data/ui/views/debug_peer_is_down.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | Another groovy debug dashboard from Richard Morgan 4 | 5 | (index=_audit) 6 | (index=_internal OR index=core_splunk_internal) 7 | (index=_internal OR index=core_splunk_internal) (sourcetype=metrics OR sourcetype=splunkd) METRICS 8 | index=_introspection 9 | 10 | 11 | $splunkd$ sourcetype=splunkd DistributedPeerManager status=Down Unable to distribute to peer named 12 | | rex field=_raw "Unable to distribute to peer named (?<peer_name>[^\s]*) at uri=(?<peer_uri>[^:]+):" 13 | | eval peer_name=if(peer_name="",peer_uri, peer_name) 14 | | table _time host peer_name peer_uri 15 | $time.earliest$ 16 | $time.latest$ 17 | 18 | 19 | | stats count by host 20 | 21 | 22 | $audit$ host=$selected_host$ TERM(action=search) TERM(info=granted) TERM(search_id=*) 23 | | fields + _time host user search_id 24 | $selection_earliest$ 25 | $selection_latest$ 26 | 27 | 28 | $splunkd$ TERM(server=$selected_host$) sourcetype=splunkd_remote_searches (Streamed search search starting) OR (Streamed search connection terminated) 29 | | eval stage=if(searchmatch("starting"),1,0) 30 | | table _time host search_id stage 31 | 32 | $selection_earliest$ 33 | $selection_latest$ 34 | 35 | 36 | | union 37 | [ search $splunkd$ component=CMMaster ((Starting a rolling restart of the peers) OR (restart rolling finished) OR (scheduled rebalance primaries)) 38 | | eval annotation_label= case(searchmatch("finished"),"Finished RR", searchmatch("rebalance"), "Trigger Rebalance", searchmatch("Starting"),"Starting RR"), annotation_category= case(searchmatch("finished"),"Finished", searchmatch("rebalance"), "Rebalance", searchmatch("Starting"),"Starting") 39 | | table _time anno* _raw 40 | ] 41 | [| tstats count prestats=true where $filter_important_hosts$ $splunkd$ sourcetype=splunkd TERM(splunkd) TERM(starting) TERM(build) by _time host span=1s 42 | | eval type="startup" 43 | | tstats count prestats=true append=true where $filter_important_hosts$ $splunkd$ sourcetype=splunkd TERM(ServerConfig) TERM(My) TERM(is) TERM(GUID) TERM(generated) by _time host span=1s 44 | | eval type=if(isNull(type),"guid change",type) 45 | | tstats count prestats=true append=true where $filter_important_hosts$ $splunkd$ sourcetype=splunkd TERM(Shutting) TERM(down) TERM(splunkd) by _time host span=1s 46 | | eval type=if(isNull(type),"shutdown",type) 47 | | rex field=host "^(?<host_short>[^.]+)" 48 | | stats count by host_short type _time 49 | | rex field=queue "^(?<queue_short>[^q]+)queue" 50 | | eval annotation_label=type." on ".host_short, annotation_category=type 51 | | sort - _time 52 | | fields _time host_short anno* ] 53 | [ search $splunkd$ host=c0m1* log_level=info TERM(from=*) TERM(to=*) TERM(reason=*) transitioning (up OR down OR restarting) 54 | | eval host=peer_name 55 | | search $filter_important_hosts$ 56 | | rex field=peer_name "^(?<short_name>[^\.]+)" 57 | | eval annotation_label=from."->".to." 
".short_name, 58 | annotation_category="CMPeer - ".reason 59 | | table _time annotation_label annotation_category] 60 | | table * 61 | $time.earliest$ 62 | $time.latest$ 63 | 64 | 65 | 66 | $result.hosts_IN$ 67 | $result.indexer_IN$ 68 | $result.cluster_master_IN$ 69 | $result.search_heads_IN$ 70 | 71 | $splunkd$ INFO TERM(instance_roles=*) (search_head OR cluster_master OR indexer) sourcetype=splunkd TERM(group=instance) 72 | | fields host instance_roles index_cluster_label 73 | | eval search_head=if(like(instance_roles,"%search_head%"),1,0), 74 | cluster_master=if(like(instance_roles,"%cluster_master%"),1,0), 75 | indexer=if(like(instance_roles,"%indexer%"),1,0) 76 | | stats 77 | values(host) as all_hosts 78 | values(eval(if(search_head=1,host,""))) as search_heads 79 | values(eval(if(cluster_master=1,host,""))) as cluster_master 80 | values(eval(if(indexer=1,host,""))) as indexer 81 | | eval 82 | search_heads_IN="host IN (".mvjoin(mvfilter(search_heads!=""), ", ").")", 83 | cluster_master_IN="host IN (".mvjoin(mvfilter(cluster_master!=""), ", ").")", 84 | indexer_IN="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")", 85 | hosts_IN="host IN (".mvjoin(mvfilter(all_hosts!=""), ", ").")", 86 | 87 | search_heads_OR="(host=".mvjoin(mvfilter(search_heads!=""), " OR host=").")", 88 | cluster_master_OR="(host=".mvjoin(mvfilter(cluster_master!=""), " OR host=").")", 89 | indexer_OR="(host=".mvjoin(mvfilter(indexer!=""), " OR host=").")", 90 | hosts_OR="(host=".mvjoin(mvfilter(all_hosts!=""), " OR host=").")" 91 | -60m@m 92 | now 93 | 94 |
95 | 96 | 97 | 98 | -60m@m 99 | now 100 | 101 | 102 | 103 | if(isnull($time.latest$),31, if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)<31,31,round(((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)/31)*31)) 104 | if(isnull($time.latest$),10,if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)<10,10,round(((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$selected_bins$)/10)*10)) 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | span= 113 | sec 114 | 31 115 | 116 | 117 | 118 | span= 119 | sec 120 | 10 121 | 122 |
123 |
124 |
125 | host=$selected_host$ received timeouts from the following search peers
126 |
127 |
128 | 100
129 | 250
130 | 500
131 | 750
132 | 1000
133 | 500
134 |
135 |
136 |
137 | label
138 | host
139 |
140 |
141 | | eval label=host." (".count.")"
142 | | sort - count
143 |
144 | *
145 | *
146 |
147 |
148 |
149 | *
150 | peer_name=
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 | | search host=$selected_host$ $filter_peers$
159 | | timechart $selected_introspection_span$ bins=1000 limit=0 count by peer_name
160 |
161 |
162 | $start$
163 | $end$
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |

177 | Open this dashboard in a new window with the time selection above 178 |

179 | 180 |
181 |
181 |
182 |
183 |
184 | Duration of search outages, select an outage to reveal what the indexers were doing
185 |
186 |
187 | label
188 | host
189 |
190 | | stats count by host
191 | | eval label=host." (".count.")"
192 | | sort - count
193 |
194 | *
195 | *
196 | host=
197 |
198 |
199 |
200 | label
201 | peer_name
202 |
203 | | stats count by peer_name
204 | | eval label=peer_name." (".count.")"
205 | | sort - count
206 |
207 | *
208 | *
209 | peer_name=
210 |
211 |
212 |
213 | | search $filter_search_head$ $filter_search_peer$
214 | | sort - _time
215 | | transaction peer_name host maxpause=120s maxevents=10000000
216 | | table _time label duration eventcount host peer_name
217 | | eval duration_string=tostring(duration,"duration")
218 | | eval _realtime=_time
219 | | sort - duration
220 | | rename host as search_head
221 | | rename peer_name as search_peer
222 |
223 |
224 |
225 |
226 | $row._realtime$-60
227 | $row._realtime$+$row.duration$+60
228 | $row.search_head$
229 | $row.search_peer$
230 |
232 |
233 |
234 | 235 | 236 | The number of searches executed by the search peers from search head 237 | 238 | 239 | label 240 | search 241 | 242 | $splunkd$ CMMaster status=success site* earliest=-4hr latest=now 243 | | rex field=_raw max_match=64 "(?<site_pair>site\d+,\"?[^\",]+)" 244 | | eval cluster_master=host 245 | | fields + site_pair cluster_master 246 | | fields - _* 247 | | dedup site_pair 248 | | mvexpand site_pair 249 | | dedup site_pair 250 | | rex field=site_pair "^(?<site_id>site\d+),\"?(?<indexer>.*)" 251 | | rex field=cluster_master "^(?<short_name_cm>[^\.]+)" 252 | | eval search="host=".indexer, host_count=1 253 | | appendpipe 254 | [| stats values(indexer) as indexers by site_id short_name_cm 255 | | eval host_count=mvcount(indexers), 256 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 257 | | eval label=site_id." (".host_count." idxs @ ".short_name_cm 258 | ] 259 | | appendpipe 260 | [| stats values(indexer) as indexers dc(site_id) as site_count by short_name_cm 261 | | eval host_count=mvcount(indexers), 262 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 263 | | eval label=short_name_cm." (".host_count." idx ".site_count." sites)" ] 264 | | rex field=indexer "^(?<short_name_idx>[^\.]+)" 265 | | eval label=if(isnull(label), short_name_idx." (".site_id."@".short_name_cm.")", label) 266 | | stats max(host_count) as count by label search 267 | | sort 0 - count 268 | -24h@h 269 | now 270 | 271 | 272 | 273 | 274 | $selected_indexers$ 275 | 276 | 277 | None 278 | None 279 | 280 | 281 | 282 | host=* 283 | 284 | 285 | 286 | * 287 | 288 | | fields + search_heads 289 | | mvexpand search_heads 290 | | where search_heads!="" 291 | 292 | * 293 | search_heads 294 | search_heads 295 | 296 | 297 | 298 | $splunkd$ $selected_targets$ sourcetype=splunkd_remote_searches Streamed search TERM(starting:) TERM(search_id=remote_$selected_search_head$*) 299 | | rex field=_raw "search_id=remote_(?<search_head>[^_]+)_" 300 | | chart limit=0 count by host search_head 301 | $selected_earliest$ 302 | $selected_latest$ 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 |
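
Note: the outage table above stitches "Unable to distribute to peer" events together with transaction, which is easy to read but memory-hungry at maxevents=10000000. A cheaper approximation of the same grouping, a sketch that uses streamstats to cut a new session whenever the gap between complaints exceeds the same 120s pause:

index=_internal sourcetype=splunkd DistributedPeerManager "Unable to distribute to peer named"
| rex field=_raw "Unable to distribute to peer named (?<peer_name>[^\s]*) at uri=(?<peer_uri>[^:]+):"
| sort 0 host peer_name _time
| streamstats current=f last(_time) as prev_time by host peer_name
| eval new_session=if(isnull(prev_time) OR _time-prev_time>120, 1, 0)
| streamstats sum(new_session) as session_id by host peer_name
| stats min(_time) as outage_start max(_time) as outage_end count by host peer_name session_id
| eval duration=outage_end-outage_start
| sort - duration

This is not a drop-in replacement (transaction also hands you the raw events), but it scales far better on busy search heads.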
321 | -------------------------------------------------------------------------------- /default/data/ui/views/debug_replication.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | Another groovy operations dashboard by Richard Morgan productions 4 | 5 | 62 6 | (index=core_splunk_internal OR index=_internal) 7 | 8 | 9 | $_internal$ sourcetype=splunkd $selected_targets$ replication queue for TERM(peer=*) (BucketReplicator full) OR (has room now) 10 | | join peer 11 | [| loadjob $guid_to_host_mapping$] 12 | | rex field=_raw "bid=(?<idx>[^~]+)" 13 | | eval state=if(searchmatch("BucketReplicator full"), "blocked", "unblocked"), 14 | end=if(searchmatch("has room now"), true(), NULL) 15 | | table _time host bid peer _raw indexer idx state 16 | 17 | $time.earliest$ 18 | $time.latest$ 19 | 1 20 | 21 | $job.sid$ 22 | 23 | 24 | 25 | 26 | | dbinspect index=_internal 27 | | stats count by guId splunk_server 28 | | rename splunk_server as indexer 29 | | rename guId as peer 30 | -15m 31 | now 32 | 33 | $job.sid$ 34 | 35 | 36 | $job.sid$ 37 | 38 | 39 | 40 | 41 | | loadjob $bucketreplicator_sid$ 42 | | search idx=$selected_idx$ 43 | | transaction host bid startswith=eval(state="blocked") endswith=eval(state="unblocked") mvraw=true 44 | $earliest$ 45 | $latest$ 46 | 47 |
48 | 49 | 50 | label 51 | search 52 | 53 | $_internal$ sourcetype=splunkd CMMaster status=success site* earliest=-4hr latest=now 54 | | rex field=_raw max_match=64 "(?<site_pair>site\d+,\"?[^\",]+)" 55 | | eval cluster_master=host 56 | | fields + site_pair cluster_master 57 | | fields - _* 58 | | dedup site_pair 59 | | mvexpand site_pair 60 | | dedup site_pair 61 | | rex field=site_pair "^(?<site_id>site\d+),\"?(?<indexer>.*)" 62 | | rex field=cluster_master "^(?<short_name_cm>[^\.]+)" 63 | | eval search="host=".indexer, host_count=1 64 | | appendpipe 65 | [| stats values(indexer) as indexers by site_id short_name_cm 66 | | eval host_count=mvcount(indexers), 67 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 68 | | eval label=site_id." (".host_count." idxs @ ".short_name_cm 69 | ] 70 | | appendpipe 71 | [| stats values(indexer) as indexers dc(site_id) as site_count by short_name_cm 72 | | eval host_count=mvcount(indexers), 73 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 74 | | eval label=short_name_cm." (".host_count." idx ".site_count." sites)" ] 75 | | rex field=indexer "^(?<short_name_idx>[^\.]+)" 76 | | eval label=if(isnull(label), short_name_idx." (".site_id."@".short_name_cm.")", label) 77 | | stats max(host_count) as count by label search 78 | | sort 0 - count 79 | -24h@h 80 | now 81 | 82 | 83 | 84 | 85 | $selected_indexers$ 86 | 87 | 88 | None 89 | None 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -60m@m 98 | now 99 | 100 | 101 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<31,31,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 102 | 103 | 104 | 105 | 106 | Crude 107 | Low 108 | Medium 109 | High 110 | Ultra 111 | 500 112 | 113 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<31,31,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 114 | 115 | 116 |
117 | 118 | 119 | 120 |

Please select a cluster, site or host

121 | 122 |
123 |
124 | 125 | 126 | BucketReplicator full messages sending to indexers 127 | 128 | 129 | Indexer complaining 130 | The problem indexer 131 | indexer 132 | 133 | 134 | 135 | * 136 | 137 | | stats count by idx 138 | | sort - count 139 | | eval label=idx." (".count.")" 140 | 141 | * 142 | label 143 | idx 144 | 145 | 146 | 147 | | search BucketReplicator full idx=$selected_idx$ 148 | | chart limit=50 count by $selected_replication_indexer$ idx 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | $click.name2$ 185 | 186 | 187 | 188 | 189 | 190 | 191 | Timeline of delays $selected_idx$ 192 | 193 | 194 | Complaining indexer 195 | Remote indexer 196 | Bucket ID 197 | indexer 198 | 199 | 200 | 201 | | xyseries _time $selected_timeline_attribute$ duration 202 | 203 | 204 | 205 | 206 | 207 | $row.bid$ 208 | 209 | 210 | $start$ 211 | $end$ 212 | 213 | 214 | 215 | 216 | 217 | 218 | Selected summary $selected_idx$ 219 | 220 | 221 | local, remote 222 | remote, local 223 | remote, bid 224 | local, bid 225 | remote 226 | local 227 | indexer host 228 | 229 | 230 | 231 | total duration blocked 232 | average duration blocked 233 | count 234 | sum(duration) 235 | 236 | 237 | 238 | | where _time > $selected_earliest$ and _time < $selected_latest$ 239 | | chart limit=80 $selected_summary_aggregator$ by $selected_summary_attribute$ 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 |
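
Note: the peer field in BucketReplicator messages is a GUID, which is why the base search joins against a guid-to-host map built with dbinspect. That mapping is handy on its own; the standalone form of the saved sub-search above:

| dbinspect index=_internal
| stats count by guId splunk_server
| rename guId as peer, splunk_server as indexer
| fields peer indexer

Any index that every peer holds buckets for works here; _internal is just a safe default.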
250 | -------------------------------------------------------------------------------- /default/data/ui/views/debug_search.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | $hostname_tstats_done$ 6 | | where false() 7 | | inputcsv dispatch=true search.log append=true 8 | | foreach * [| eval _raw='<<FIELD>>'] 9 | | fields + _raw 10 | | eval _time = strptime(_raw,"%m-%d-%Y %H:%M:%S.%Q") 11 | | rex field=_raw "\s(?<log_level>INFO|DEBUG|WARN|FATAL|ERROR)\s+(?<component>[A-Za-z]+)" 12 | 13 | 14 | * 15 | 16 |
17 | 18 | 19 | $hostname_tstats$ seconds - $hostname_tstats_sid$ 20 | 21 | 22 | 23 | -24h@h 24 | now 25 | 26 | 27 | 28 | $hostname_tstats_query$ 29 | 30 | 31 | $job.sid$ 32 | 33 | $job.runDuration$ 34 | $job.search$ 35 | 36 | 37 | $job.sid$ 38 | | noop 39 | 40 | | tstats 41 | count 42 | where index=_internal Metrics sourcetype=splunkd TERM(group=tcpin_connections) 43 | by PREFIX(hostname=) 44 | $time.earliest$ 45 | $time.latest$ 46 | 1 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | duration dispatch remote 86 | 87 | 88 | | where level1="duration" AND level2="dispatch" AND level3="stream" AND level4="remote" and level5!="" 89 | | chart limit=0 sum(value) as value by level5 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | out_ct dispatch stream remote 98 | 99 | 100 | | where level1="out_ct" AND level2="dispatch" AND level3="stream" AND level4="remote" AND level5!="" 101 | | chart limit=0 sum(value) as value by level5 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | Duration dispatch 112 | 113 | 114 | | where level1="duration" AND level2="dispatch" 115 | | chart limit=0 sum(value) as value by level3 level4 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | Duration dispatch 125 | 126 | 127 | | where level1="duration" AND level2="command" 128 | | chart limit=0 sum(value) as value by level3 level4 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | Duration dispatch 138 | 139 | 140 | | where level1="duration" AND level2="dispatch" AND level3="evaluate" 141 | | chart limit=0 sum(value) as value by level4 level5 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | Duration dispatch 150 | 151 | 152 | | where level1="duration" AND level2="dispatch" 153 | | chart limit=0 sum(value) as value by level3 level4 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | Search log for above search 165 | 166 | 167 | label 168 | component 169 | 170 | | stats count by component | eval label=component." (".count.")" 171 | | sort - count 172 | 173 | * 174 | * 175 | , 176 | component IN ( 177 | ) 178 | 179 | 180 | 181 | | search $selected_components$ 182 | | timechart span=100ms limit=0 fixedrange=false count by component 183 | 184 | 185 | 186 | 187 | 188 | 189 | $click.name2$ 190 | 191 | 192 | $start$ 193 | $end$ 194 | 195 | 196 | 197 | 198 | 199 | 200 | Search log for above search 201 | 202 | 203 | | search _time>=$selection_earliest$ AND _time<=$selection_latest$ AND $selected_components$ 204 | 205 | 206 |
207 |
208 |
209 | 210 | | where false() 211 | | inputcsv append=true dispatch=true info.csv 212 | | eval countMap=_countMap 213 | | table host countMap 214 | | rex field=countMap max_match=1000 "(?<name>[A-Za-z.\d_]+;\d+;)" 215 | | table name 216 | | mvexpand name 217 | | rex field=name "^(?<level1>[^.]+)(\.(?<level2>[^.]+))?(\.(?<level3>[^.]+))?(\.(?<level4>[^.]+))?(\.(?<level5>[^.]+))?(\.(?<level6>[^.]+))?;(?<value>\d+);" 218 | | fillnull level3 value="" 219 | | fillnull level4 value="" 220 | | fillnull level5 value="" 221 | | fillnull level6 value="" 222 | 223 |
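
Note: the unusual trick in this dashboard is inputcsv dispatch=true, which reads files out of the current search's own dispatch directory, so a search can load and parse its own search.log and the countMap counters in info.csv. The pattern in its minimal form (the leading search exists only to create the dispatch artifacts; every row of it is discarded):

| makeresults
| where false()
| inputcsv dispatch=true append=true search.log
| foreach * [| eval _raw='<<FIELD>>']
| fields + _raw
| eval _time=strptime(_raw,"%m-%d-%Y %H:%M:%S.%Q")
| rex field=_raw "\s(?<log_level>INFO|DEBUG|WARN|FATAL|ERROR)\s+(?<component>[A-Za-z]+)"

In the dashboard the leading search is the tstats query being profiled; makeresults here is just a placeholder.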
224 | -------------------------------------------------------------------------------- /default/data/ui/views/discovery_forwarding_hierrachy.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | $search_method$ 6 | 7 | $time.earliest$ 8 | $time.latest$ 9 | 10 |
11 | 12 | 13 | 14 | -15m 15 | now 16 | 17 | 18 | 19 | 20 | Use SPL search and save 21 | Use saved lookup 22 | | inputcsv hosts_to_roles.csv 23 | 24 | 25 | 26 | ack 27 | endpoint_arch 28 | endpoint_build 29 | endpoint_os 30 | endpoint_version 31 | fwdType 32 | pipelines 33 | role 34 | ssl 35 | sum_kb 36 | NONE 37 | | noop 38 | 39 |
40 |
41 |
42 | Forwarders sending data with acknowledgements enabled
43 |
44 |
45 | | chart count by ack $split_by$
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Forwarders by OS version
56 |
57 |
58 | | chart count by endpoint_os $split_by$
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 | Forwarders reporting CPU architecture
69 |
70 |
71 | | chart dc(host) as count by endpoint_arch $split_by$
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 | Reverse-engineered role
84 |
85 | We consider an intermediate forwarder to have inputs and outputs, endpoints to have outputs only, and indexers inputs only (a minimal sketch of this heuristic appears at the end of this file)
86 |
87 | | chart limit=100 count by role $split_by$
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 | Forwarders sending data with SSL enabled
98 |
99 | Using SSL
100 |
101 | | chart limit=100 count by ssl $split_by$
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | Comparative sizing of forwarders
112 |
113 | Not all forwarders are born equal: MBs in period
114 |
115 | | eval sum_mb=sum_kb/1024 | bin bins=50 sum_mb | chart limit=100 count by sum_mb $split_by$
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 | Forwarder versions detected in logs
127 |
128 |
129 | | chart count by endpoint_version $split_by$
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 | How many targets did the forwarders connect to during the time period scanned?
142 |
143 |
144 | | chart count by target_count $split_by$
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | Forwarders by version, host and connections
157 |
158 |
159 | Endpoints
160 | Intermediates
161 | Both
162 | unfiltered
163 | role="intermediate" OR role="endpoint"
164 |
165 |
166 | Note: once run, this search dumps out a lookup table called "hosts_to_roles.csv" for faster access
167 |
168 | | search $role_filter$
169 |
170 |
171 |
172 |
173 |
174 |
175 |
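
Note: a minimal sketch of the input/output role heuristic referenced above, assuming the standard tcpin_connections and tcpout_connections metrics groups in metrics.log (an instance whose metrics show only outputs is treated as an endpoint, only inputs as an indexer, both as an intermediate):

index=_internal sourcetype=splunkd Metrics (TERM(group=tcpin_connections) OR TERM(group=tcpout_connections))
| eval has_input=if(group="tcpin_connections",1,0),
    has_output=if(group="tcpout_connections",1,0)
| stats max(has_input) as has_input max(has_output) as has_output by host
| eval role=case(has_input=1 AND has_output=1, "intermediate",
    has_output=1, "endpoint",
    has_input=1, "indexer")
| stats count by role

Hosts only appear here if their metrics.log reaches index=_internal, so unmonitored boxes will not be classified.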
176 | -------------------------------------------------------------------------------- /default/data/ui/views/discovery_searches.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 62 5 | (index=core_splunk_internal OR index=_internal) Metrics (sourcetype=splunkd OR sourcetype=metrics) 6 | (index=core_splunk_internal OR index=_internal) sourcetype=splunkd 7 | 8 |
9 | 10 | 11 | index=_internal 12 | 13 |
14 | 15 | 16 | Indexer discovery via REST 17 | 18 | This method is flawed in that the REST API doesn't normally query all the SHs or the CM 19 | 20 | | rest /services/server/info 21 | | eval search_head=if(like(server_roles,"%search_head%"),1,0), 22 | cluster_master=if(like(server_roles,"%cluster_master%"),1,0), 23 | indexer=if(like(server_roles,"%indexer%"),1,0) 24 | | stats values(eval(if(search_head=1,host,""))) as search_heads 25 | values(eval(if(cluster_master=1,host,""))) as cluster_master 26 | values(eval(if(indexer=1,host,""))) as indexer 27 | by cluster_label 28 | | eval search_heads="host IN (".mvjoin(mvfilter(search_heads!=""), ", ").")", 29 | cluster_master="host IN (".mvjoin(mvfilter(cluster_master!=""), ", ").")", 30 | indexer="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")" 31 | -2min 32 | -1min 33 | 34 | 35 | 36 |
37 |
38 |
39 | 40 | 41 | Component discovery via roles found in Metrics 42 | 43 | This requires over a 60 min scan to get all the events 44 | 45 | $selected_internal$ INFO TERM(instance_roles=*) (search_head OR cluster_master OR indexer) sourcetype=splunkd TERM(group=instance) 46 | | fields host instance_roles index_cluster_label 47 | | eval search_head=if(like(instance_roles,"%search_head%"),1,0), 48 | cluster_master=if(like(instance_roles,"%cluster_master%"),1,0), 49 | indexer=if(like(instance_roles,"%indexer%"),1,0) 50 | | stats values(eval(if(search_head=1,host,""))) as search_heads 51 | values(eval(if(cluster_master=1,host,""))) as cluster_master 52 | values(eval(if(indexer=1,host,""))) as indexer by index_cluster_label 53 | | eval search_heads="host IN (".mvjoin(mvfilter(search_heads!=""), ", ").")", 54 | cluster_master="host IN (".mvjoin(mvfilter(cluster_master!=""), ", ").")", 55 | indexer="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")" 56 | -60m@m 57 | now 58 | 59 | 60 | 61 | 62 |
63 |
64 |
64 |
65 |
66 |
67 | Cluster master from Metrics using "cmmaster_service"
68 |
69 | This is efficient in that it only requires scanning 30 seconds of data
70 |
71 | $selected_internal$ Metrics Info TERM(name=cmmaster_service) sourcetype=splunkd TERM(group=subtask_counts)
72 | | stats values(host) as search
73 | | eval search="host IN (".mvjoin(search, ", ").")"
74 | | fields + search
75 | -2min
76 | -1min
77 |
78 |
79 |
80 |
81 |
82 | 83 | 84 | Search head cluster members via Metrics 85 | 86 | 87 | index=_internal earliest=-90min latest=now sourcetype=splunkd Metrics TERM(group=instance) TERM(name=instance) (cluster_search_head OR search_head) 88 | | stats 89 | count 90 | values(instance_roles) as roles 91 | values(index_cluster_label) as idx_cluster_name 92 | by instance_guid server_name 93 | | eval mv_roles=split(roles, ", ") 94 | | eval idx_cluster_name=if(idx_cluster_name="none", "", idx_cluster_name) 95 | | stats 96 | count(eval(mv_roles="indexer")) as role_indexer 97 | count(eval(mv_roles="search_peer")) as role_search_peer 98 | count(eval(mv_roles="cluster_search_head")) as role_cluster_search_head 99 | count(eval(mv_roles="search_head")) as role_search_head 100 | count(eval(mv_roles="cluster_slave")) as role_cluster_slave 101 | count(eval(mv_roles="kv_store")) as role_kv_store_role 102 | count(eval(mv_roles="cluster_master")) as role_cluster_master 103 | count(eval(mv_roles="license_master")) as role_license_master 104 | count(eval(mv_roles="deployment_server_master")) as role_deployment_server 105 | count(eval(mv_roles="deployment_client")) as role_deployment_client 106 | count(eval(mv_roles="shc_captain")) as role_shc_captain 107 | count(eval(mv_roles="shc_member")) as role_shc_member 108 | by instance_guid server_name roles idx_cluster_name 109 | | stats values(server_name) as search 110 | | eval search="host IN ".mvjoin(search, ", ").")" 111 | -24h@h 112 | now 113 | 114 | 115 | 116 | 117 |
118 |
119 |
120 | 121 | 122 | REST API 123 | 124 | Site discovery 125 | 126 | | rest /services/cluster/config 127 | | rex field=master_uri "https://(?<cluster_master>[^:]+)" 128 | | stats dc(splunk_server) as host_count values(splunk_server) as indexer by site cluster_master cluster_label 129 | | eval indexer="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")" 130 | -24h@h 131 | now 132 | 133 | 134 | 135 | 136 |
137 |
138 |
139 | 140 | 141 | Sites by CMMaster bucket creation messages 142 | 143 | 144 | 145 | -4h@m 146 | now 147 | 148 | 149 | 150 | The time range may vary based on the number of bucket rolls 151 | 152 | $selected_internal$ sourcetype=splunkd CMMaster status=success site* 153 | | rex field=message max_match=64 "(?<site_pair>site\d+,\"[^\"]+)" 154 | | eval cluster_master=host 155 | | fields + site_pair cluster_master 156 | | fields - _* 157 | | dedup site_pair 158 | | mvexpand site_pair 159 | | dedup site_pair 160 | | rex field=site_pair "^(?<site_id>site\d+),\"(?<indexer>.*)" 161 | | stats values(indexer) as indexer by site_id cluster_master 162 | | eval host_count=mvcount(indexer), indexer="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")" 163 | $time_site_cmmaster.earliest$ 164 | $time_site_cmmaster.latest$ 165 | 166 | 167 | 168 | 169 |
170 |
171 |
172 | 173 | 174 | The host as reported by dbinspect, great if you don't have access to internal logs or the REST API 175 | 176 | 177 | | dbinspect index=* 178 | | stats values(splunk_server) as indexer 179 | | eval host_count=mvcount(indexer), indexer="host IN (".mvjoin(mvfilter(indexer!=""), ", ").")" 180 | -1hr 181 | now 182 | 183 | 184 | 185 |
186 |
187 |
188 |
189 |
190 |
191 | Get sites and clusters - use for debugging why discovery doesn't work
192 |
193 | $splunkd$ CMMaster status=success site* earliest=-4hr latest=now source=*splunkd.log*
194 | | rex field=_raw max_match=64 "(?<site_pair>site\d+,\"?[^\",]+)"
195 | | rex field=_raw "peer_name=(?<single_site>[^\s]+)"
196 | | eval site_pair=if(isnull(site_pair),"site,".single_site,site_pair)
197 | | eval cluster_master=host
198 | | fields + site_pair cluster_master
199 | | fields - _*
200 | | dedup site_pair
201 | | mvexpand site_pair
202 | | dedup site_pair
203 | | rex field=site_pair "^(?<site_id>site[^,]*),\"?(?<indexer>.*)"
204 | | rex field=cluster_master "^(?<short_name_cm>[^\.]+)"
205 | | eval search="host=".indexer, host_count=1
206 | | appendpipe
207 |     [| stats values(indexer) as indexers by site_id short_name_cm
208 |     | eval host_count=mvcount(indexers),
209 |         search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")"
210 |     | eval label=site_id." (".host_count." idxs @ ".short_name_cm
211 |     ]
212 | | appendpipe
213 |     [| stats values(indexer) as indexers dc(site_id) as site_count by short_name_cm
214 |     | eval host_count=mvcount(indexers),
215 |         search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")"
216 |     | eval label=short_name_cm." (".host_count." idx ".site_count." sites)" ]
217 | | rex field=indexer "^(?<short_name_idx>[^\.]+)"
218 | | eval label=if(isnull(label), short_name_idx." (".site_id."@".short_name_cm.")", label)
219 | | stats max(host_count) as count by label search
220 | | sort 0 - count
221 | -24h@h
222 | now
223 |
224 |
225 |
227 |
228 |
229 |
230 | -------------------------------------------------------------------------------- /default/data/ui/views/event_delay_for_host.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 6 | 7 | 8 | 9 | | makeresults 10 | 11 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 12 | 13 | 14 | 15 | | tstats count where index=_internal host=$selected_host$ | where count!=0 16 | $time.earliest$ 17 | $time.latest$ 18 | 19 | 20 | 21 | 22 | 23 | $result.count$ 24 | 25 | 26 | 27 | 28 | | tstats count where index=_introspection component::Hostwide host=$selected_host$ | where count!=0 29 | $time.earliest$ 30 | $time.latest$ 31 | 32 | 33 | 34 | 35 | 36 | $result.count$ 37 | 38 | 39 | 40 | 41 | | tstats count where index=_introspection component::hec* host=$selected_host$ | where count!=0 42 | $time.earliest$ 43 | $time.latest$ 44 | 45 | 46 | 47 | 48 | 49 | $result.count$ 50 | 51 | 52 | 53 | 54 | 55 | | tstats max(_indextime) AS indexed_time count where host=$selected_host$ (index=* OR index=_*) latest=now earliest=$time.earliest$ _index_latest=$time.latest$ _index_earliest=$time.earliest$ by index host sourcetype splunk_server _time span=$seconds_for_bin$ 56 | | eval _time=round(_time), bin_delay=indexed_time-_time 57 | | bin span=2log5 bin_delay 58 | 59 | $time.earliest$ 60 | $time.latest$ 61 | 62 |
63 | 64 | 65 | 66 | -24h@h 67 | now 68 | 69 | 70 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$time_resolution$)) 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | All 79 | ( 80 | ) 81 | index=" 82 | " 83 | OR 84 | 85 | | stats count by index 86 | | eval label=index." (".count.")" 87 | 88 | * 89 | index 90 | index 91 | 92 | 93 | 94 | Crude 95 | Low 96 | Medium 97 | High 98 | Ultra 99 | 500 100 | 101 |
102 | 103 | 104 | 105 |

106 |
107 | Please enter value for host 108 |
109 |

110 | 111 |
112 |
113 | 114 | 115 | 116 |

117 |
118 | Event metadata for $selected_host$ 119 |
120 |

121 | 122 |
123 |
124 | 125 | 126 | $show_introspection_link$ 127 | 128 |

129 |
130 | Hyperlink to _introspection metrics 131 |
132 |

133 | 134 |
135 | 136 | depends="$show_internal_link$" 137 | 138 |

139 |
140 | Hyperlink to _internal logs 141 |
142 |

143 | 144 |
145 | 146 | depends="$show_introspection_hec_link$" 147 | 148 |

149 |
150 | Show HEC metrics 151 |
152 |

153 | 154 |
155 |
156 | 157 | 158 | Count of events generated at transmission time (_time) 159 | 160 | How many events where generated by $selected_host$ time x by index? 161 | 162 | | search $index_filter$ 163 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by index 164 | 165 | 166 | $start$ 167 | $end$ 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | Count of events generated by received time (_indextime) by indexer? 181 | 182 | How many events where received from $selected_host$ time x by index? 183 | 184 | | search $index_filter$ 185 | | eval _time=indexed_time 186 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by index 187 | 188 | 189 | $start$ 190 | $end$ 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | Delay of events by generation time (_time) 208 | 209 | When $selected_host$ generated events, how long did it take before they were indexed? 210 | 211 | | search $index_filter$ 212 | | eval bin_delay=if(bin_delay<0, "future", bin_delay) 213 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by bin_delay 214 | 215 | 216 | $start$ 217 | $end$ 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | Delay of events by received time (_indexedtime) 233 | 234 | 235 | | search $index_filter$ 236 | | eval _time=indexed_time 237 | | eval bin_delay=if(bin_delay<0, "future", bin_delay) 238 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by bin_delay 239 | 240 | 241 | $start$ 242 | $end$ 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | Count of events generated at transmission time (_time) 260 | 261 | How many events where generated by $selected_host$ time x by index? 262 | 263 | | search $index_filter$ 264 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by sourcetype 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | Count of events generated by received time (_indextime) by indexer? 275 | 276 | Count of events generated by received time (_indextime) by indexer? 277 | 278 | | search $index_filter$ 279 | | eval _time=indexed_time 280 | | eval bin_delay=if(bin_delay<0, "future", bin_delay) 281 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by sourcetype 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | Which indexer received the event from $selected_host$ 294 | 295 | 296 | | search $index_filter$ 297 | | eval _time=indexed_time 298 | | timechart limit=0 span=$seconds_for_bin$sec sum(count) by splunk_server 299 | 300 | 301 | $start$ 302 | $end$ 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | some notes 314 | 315 | 316 |
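
Note: everything on this dashboard reduces to one identity: delay = _indextime - _time. A minimal standalone sketch for spotting laggy sourcetypes on one host (my_host is a placeholder; with span=10s the delay is only accurate to within the bin, and you can add index=_* if you also want internal logs, as the dashboard does):

| tstats max(_indextime) as indexed_time count where index=* host=my_host by sourcetype _time span=10s
| eval delay=indexed_time-_time
| stats median(delay) as median_delay max(delay) as max_delay sum(count) as events by sourcetype
| sort - max_delay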
317 | -------------------------------------------------------------------------------- /default/data/ui/views/event_delay_for_index.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | | tstats max(_indextime) as indexed_time count where $filter_indexes$ latest=+100day earliest=-300day _index_latest=$time.latest$ _index_earliest=$time.earliest$ by index host splunk_server _time span=1s 13 | | eval _time=round(_time), delay=indexed_time-_time, delay_str=tostring(delay,"duration") 14 | | eventstats max(delay) as max_delay max(_time) as max_time count as eps by host index 15 | | where max_delay = delay 16 | | eval max_time=_time 17 | | sort - delay 18 | 19 | 20 | 21 | 22 | -100d@d 23 | now 24 | 1 25 | 26 | 27 | | tstats max(_indextime) AS indexed_time count where host=$selected_endpoint$ index=* latest=now earliest=-24hrs _index_latest=$time_indexer.latest$ _index_earliest=$time_indexer.earliest$ by index host splunk_server _time source span=10s 28 | | eval _time=round(_time), bin_delay=indexed_time-_time 29 | | bin bins=10 bin_delay 30 | 31 | 32 | 33 | -4h@m 34 | now 35 | 36 |
37 | 38 | 39 | 40 | -2sec 41 | -1sec 42 | 43 | 44 | 45 | 46 | ( 47 | ) 48 | index=" 49 | " 50 | OR 51 | label 52 | index 53 | 54 | | eventcount summarize=f index=* index=_* | stats sum(count) as count by index | eval label=index." (".count.")" | sort - count | fields + index label 55 | -24h@h 56 | now 57 | 58 | _internal 59 | 60 |
61 | 62 | 63 | $aggregation_function_lhs$ delay by index 64 | 65 | 66 | average 67 | mode 68 | median 69 | mean 70 | standard dev 71 | per_second 72 | p90 73 | p95 74 | p99 75 | max 76 | min 77 | sum 78 | range 79 | variance 80 | avg 81 | 82 | 83 | 84 | | stats mode(delay) by index 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | $row.index$ 93 | 94 | 95 | 96 | 97 | $aggregation_function_rhs$ delay by index 98 | 99 | 100 | average 101 | mode 102 | median 103 | mean 104 | standard dev 105 | per_second 106 | p90 107 | p95 108 | p99 109 | max 110 | min 111 | sum 112 | range 113 | variance 114 | p95 115 | 116 | 117 | 118 | | stats p95(delay) by index 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | $row.index$ 127 | 128 | 129 | 130 | 131 | 132 | 133 | Event count per index (log scale) 134 | 135 | 136 | | sort - delay | stats sum(count) by index 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | $row.index$ 145 | 146 | 147 | 148 | 149 | Number of hosts in each index (log scale) 150 | 151 | 152 | | stats dc(host) by index 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | $row.index$ 162 | 163 | 164 | 165 | 166 | 167 | 168 | Event distribution 169 | 170 | 171 | | stats sum(count) as events_per_indexer by splunk_server index 172 | | xyseries splunk_server index events_per_indexer 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | $click.name2$ 181 | $row.splunk_server$ 182 | 183 | 184 | 185 | 186 | 187 | 188 | Hosts sending to $selected_index$ with the delay and number of events sent during period $selected_indexer$ 189 | 190 | 191 | * 192 | 193 | 194 | 195 | 196 | -24h@h 197 | now 198 | 199 | 200 | 201 | 202 | | where index="$selected_index$" 203 | | search host=$host_filter$ 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | event_delay_for_host?form.time.earliest=$drilldown.earliest$&form.time.latest=$drilldown.latest$&form.selected_host=$row.host$ 214 | 215 |
216 |
217 |
218 |
219 | -------------------------------------------------------------------------------- /default/data/ui/views/event_delay_index_sourcetype.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | | tstats max(_indextime) as indexed_time count where $filter_sourcetypes$ $filter_indexes$ latest=$time.latest$ earliest=$time.earliest$ _index_latest=$time.latest$ _index_earliest=$time.earliest$ by index sourcetype _time span=1s 10 | | eval _time=round(_time), delay=indexed_time-_time, delay_str=tostring(delay,"duration"), index_sourcetype=index."+".sourcetype 11 | 12 | 13 | $time.earliest$ 14 | $time.latest$ 15 | 1 16 | 17 |
18 | 19 | 20 | 21 | -24h@h 22 | now 23 | 24 | 25 | 26 | 27 | ( 28 | ) 29 | index=" 30 | " 31 | OR 32 | label 33 | index 34 | 35 | | eventcount summarize=f index=* index=_* | stats sum(count) as count by index | eval label=index." (".count.")" | sort - count | fields + index label 36 | -24h@h 37 | now 38 | 39 | 40 | 41 | 42 | ( 43 | ) 44 | sourcetype=" 45 | " 46 | OR 47 | label 48 | sourcetype 49 | 50 | | tstats count where $filter_indexes$ latest=+100day earliest=-300day _index_latest=-1sec _index_earliest=-2sec by sourcetype 51 | | eval label=sourcetype." (".count.")" 52 | -24h@h 53 | now 54 | 55 | All 56 | * 57 | 58 |
59 | 60 | 61 | P95 delay by _time 62 | 63 | 64 | | timechart limit=30 bins=600 p95(delay) by index_sourcetype 65 | 66 | 67 | 68 | 69 | 70 | 71 | $row.index_sourcetype$ 72 | $row._time$ 73 | 74 | 75 | 76 | 77 | 78 | 79 | P95 delay by indexed_time 80 | 81 | 82 | | eval _time=indexed_time | timechart limit=30 bins=600 p95(delay) by index_sourcetype 83 | 84 | 85 | 86 | 87 | 88 | 89 | $row.index_sourcetype$ 90 | $row._time$ 91 | 92 | 93 | 94 | 95 | 96 | 97 | Hosts sending to $selected_index$ at $selected_time$, with the delay and number of events sent during the period 98 | 99 | 100 | * 101 | 102 | 103 | 104 | | where index="$selected_index$" 105 | | search host=$host_filter$ 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | /app/search/event_delay_for_host?form.selected_host=$row.host$ 116 | 117 |
118 |
119 |
120 |
121 | -------------------------------------------------------------------------------- /default/data/ui/views/event_distribution_measurement.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | (index=_internal OR index=core_splunk_internal) sourcetype=splunkd 5 | (index=_internal OR index=core_splunk_internal) (sourcetype=metrics OR sourcetype=splunkd) METRICS 6 | 19 7 | 8 | 9 | 10 | $generated_search$ 11 | 12 | 13 |
14 | 15 | 16 | label 17 | search 18 | 19 | $splunkd$ CMMaster status=success site* earliest=-4hr latest=now source=*splunkd.log* 20 | | rex field=_raw max_match=64 "(?<site_pair>site\d+,\"?[^\",]+)" 21 | | rex field=_raw "peer_name=(?<single_site>[^\s]+)" 22 | | eval site_pair=if(isnull(site_pair),"site,".single_site,site_pair) 23 | | eval cluster_master=host 24 | | fields + site_pair cluster_master 25 | | fields - _* 26 | | dedup site_pair 27 | | mvexpand site_pair 28 | | dedup site_pair 29 | | rex field=site_pair "^(?<site_id>site[^,]*),\"?(?<indexer>.*)" 30 | | rex field=cluster_master "^(?<short_name_cm>[^\.]+)" 31 | | eval search="host=".indexer, host_count=1 32 | | appendpipe 33 | [| stats values(indexer) as indexers by site_id short_name_cm 34 | | eval host_count=mvcount(indexers), 35 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 36 | | eval label=site_id." (".host_count." idxs @ ".short_name_cm 37 | ] 38 | | appendpipe 39 | [| stats values(indexer) as indexers dc(site_id) as site_count by short_name_cm 40 | | eval host_count=mvcount(indexers), 41 | search="host IN (".mvjoin(mvfilter(indexers!=""), ", ").")" 42 | | eval label=short_name_cm." (".host_count." idx ".site_count." sites)" ] 43 | | rex field=indexer "^(?<short_name_idx>[^\.]+)" 44 | | eval label=if(isnull(label), short_name_idx." (".site_id."@".short_name_cm.")", label) 45 | | stats max(host_count) as count by label search 46 | | sort 0 - count 47 | -24h@h 48 | now 49 | 50 | 51 | 52 | 53 | $selected_indexers$ 54 | 55 | 56 | None 57 | None 58 | 59 | 60 | 61 | 62 | 63 | 64 | label 65 | index 66 | 67 | | eventcount index=* index=_* summarize=false 68 | | rename server as splunk_server 69 | | search $selected_targets$ 70 | | stats sum(count) as size by index 71 | | sort - size 72 | | eval label=index."(".size.")" 73 | -24h@h 74 | now 75 | 76 | main 77 | ( 78 | ) 79 | index= 80 | OR 81 | 82 | 83 | 84 | 1 85 | 86 | 87 | 88 | 2.8 89 | 90 | 91 | 92 | 60 93 | 60 94 | 95 |
96 | 97 | 98 | 6. Rate of increase of the time range based on the power - select the number of iterations 99 | 100 | 101 | 20 102 | 103 | 104 | Rate of time increase in seconds - click on column 105 | 106 | | makeresults 107 | | eval step=mvrange(1,$max_samples_to_offer$,1) 108 | | mvexpand step 109 | | eval "time range in seconds"=$step_size$*pow(step,$power$) 110 | | fields step "time range in seconds" 111 | | fields - _time 112 | -24h@h 113 | now 114 | 115 | 116 | 117 | 118 | $click.value$ 119 | 120 | 121 | 122 | 123 | 7. Click on the duration to execute the search 124 | 125 | When you click this link a search is generated in SPL and written to the base search for execution. The format is "days+hours:minutes:seconds"; for example, 10+6:30:00 translates to 10 days, 6 hours and 30 minutes. It is best to measure event distribution over shorter lengths, for instance an hour; if you aren't getting good event distribution within this time the platform needs tuning. 126 | 127 | 128 | 129 | | makeresults 130 | | eval step=mvrange(1,$steps$,1) 131 | | mvexpand step 132 | | eval step_size=round($step_size$*pow(step,$power$)) 133 | | eval jump=step_size 134 | | fields step_size jump step 135 | | eval tstats_preamble=if(step==1,"| tstats prestats=t","| tstats prestats=t append=t") 136 | | eval tstats_search=" 137 | ".tstats_preamble." count max(_time) as latest_time min(_time) as min_time where earliest=-".(jump+$offset$)."sec latest=-$offset$sec $selected_targets$ $selected_index$ by splunk_server index 138 | | eval period=if(isNull(period),\"".step_size."\",period) 139 | ", step_string=tostring(step_size,"duration") 140 | | fields - jump tstats_preamble 141 | | stats last(step_string) as max_history list(*) as * 142 | | eval tstats_search=mvjoin(tstats_search," ")." | stats count by period splunk_server index" 143 | | eventstats last(step_string) as max_history_str last(step_size) as max_history 144 | | eval step_string=mvjoin(step_string,","), step_size=mvjoin(step_size,","), step=mvjoin(step,",") 145 | | eval post_process="| stats sum(count) as count by period splunk_server index" 146 | | eval padding_search="[| tstats prestats=t count where earliest=-".max_history."sec latest=-60sec $selected_index$ by splunk_server index 147 | | stats count by splunk_server index 148 | | eval count=0 149 | | eval period=\"".step_size."\" 150 | | makemv delim=\",\" period 151 | | mvexpand period]" 152 | | eval search=tstats_search 153 | | fields max_history_str search step_string 154 | -24h@h 155 | now 156 | 157 | 158 | 159 | 160 | 161 | $row.search$ 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | You are viewing $steps$ steps starting with a period of $step_size$ seconds increasing at the power $power$ and splitting the data by index 170 | 171 | 172 | 173 | 174 | 175 | Variation of events across the indexers $selected_index$ 176 | 177 | 178 | label 179 | index 180 | 181 | | stats count by index | eval label=index." (".count.")" 182 | 183 | * 184 | ( 185 | ) 186 | index=" 187 | " 188 | OR 189 | * 190 | 191 | 192 | Variation of events across the indexers 193 | 194 | | search $filter_index$ 195 | | eval period=tostring(period,"duration") 196 | | stats sum(count) as count by period splunk_server 197 | | xyseries splunk_server period count 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | What was the normalised standard deviation over time?
210 | 211 | We want this to bottom out ASAP 212 | 213 | | eventstats dc(splunk_server) as known_servers 214 | | stats dc(splunk_server) as servers_in_period avg(count) as avg var(count) as variance by period index known_servers 215 | | eval missing_servers_in_period = known_servers - servers_in_period, 216 | fixed_variance=(servers_in_period*variance+pow(missing_servers_in_period*avg,2))/known_servers, 217 | fixed_stdev=sqrt(fixed_variance) 218 | | eval normalized_stdev=fixed_stdev/avg 219 | | fields - avg fixed_stdev fixed_variance variance 220 | | chart limit=100 values(normalized_stdev) as normalized_stdev by period index 221 | | eval period=tostring(period,"duration") 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | How many indexers received data during each period? 235 | 236 | We need this to ramp up as quickly as possible 237 | 238 | | where count!=0 239 | | chart limit=100 dc(splunk_server) by period index 240 | | eval period=tostring(period,"duration") 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | Events scanned in each step 252 | 253 | The number of events scanned goes up quickly 254 | 255 | | chart limit=100 sum(count) as ratio by period index 256 | | eval period=tostring(period,"duration") 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 |
267 | -------------------------------------------------------------------------------- /default/data/ui/views/find_cluster_master_events.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | index=_internal sourcetype=splunkd 5 | index=_internal sourcetype=splunkd_crash_log 6 | index=_introspection component::hostwide 7 | 8 | 9 | | eval search="(host=".host." earliest=".(_time-30)." latest=".(_time+30).")" 10 | | stats values(search) as search 11 | | eval search=mvjoin(search, " OR ") 12 | 13 | $result.search$ 14 | 15 | 16 | 17 | 18 | $job.sid$ 19 | 20 | | tstats values(data.splunk_version) as current_splunk_version where $introspection$ earliest=-200min latest=-198min host=c0m1* component=hostwide by host 21 | | rex field=host "^(?:idx|c0m1)\-i\-[0-9a-f]+\.(?<stack>.*)\.(?:splunkcloud.com|splunkworks.lol)" 22 | $earliest$ 23 | $latest$ 24 | 25 |
26 | 27 | 28 | 29 | -4h@m 30 | now 31 | 32 | 33 | 34 | 35 | * 36 | 37 | 38 | 39 | label 40 | stack 41 | 42 | | inputlookup aws_inventory 43 | | search FQDN=*.$filter_stacks$.* 44 | | where stack!="null" 45 | | stats count by instance_type stack region role 46 | | lookup aws_instance_lookup.csv api_name as instance_type region 47 | | stats 48 | sum(vcpu) as total_vcpu 49 | sum(memory_gib) as total_memory 50 | sum(eval(price_per_hour_reserved*count)) as cost_per_hour 51 | sum(eval(if(role="indexer", storage_gib*count,0))) as total_cache_gib 52 | by stack 53 | | sort - cost_per_hour 54 | | eval label=stack." @ $".round(cost_per_hour,0)."/hr" 55 | -24h@h 56 | now 57 | 58 | * 59 | * 60 | 61 |
62 | 63 | 64 | Drilldown configuration - when you drill down we need to pad the time window: if you are drilling down to a rolling restart (RR), use relative padding (a percentage of the duration); otherwise use absolute padding. 65 | 66 | 67 | 200 68 | 100 69 | 200 70 | 500 71 | 72 | 73 | 74 | exact 75 | 10% 76 | 25% 77 | 50% 78 | 100% 79 | 200% 80 | 0.1 81 | 82 | 83 | 84 | exact 85 | 10% 86 | 25% 87 | 50% 88 | 100% 89 | 200% 90 | 0.1 91 | .5 92 | 93 | 94 | 95 | 0 mins 96 | 5 sec 97 | 1 min 98 | 5 min 99 | 10 min 100 | 1 hour 101 | 5 102 | 103 | 104 | 105 | 0 mins 106 | 5 sec 107 | 1 min 108 | 5 min 109 | 10 min 110 | 1 hour 111 | 5 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 121 | $row.host$ 122 | $row.stack$ 123 | $row._earliest$ 124 | $row._latest$ 125 | $drilldown_latest$-$drilldown_earliest$ 126 | ceiling($drilldown_duration$/$resolution$) 127 | ceiling($drilldown_duration$/$resolution$) 128 | if($selected_span_min_1sec$<10,10,$selected_span_min_1sec$) 129 | if($selected_span_min_1sec$<31,31,$selected_span_min_1sec$) 130 | if($selected_span_min_1sec$<60,60,$selected_span_min_1sec$) 131 | cluster_master_performance?form.resolution=$resolution$&form.duration_seconds=$drilldown_duration$&form.selected_span_min_1sec=$selected_span_min_1sec$&form.selected_span_min_10sec=$selected_span_min_10sec$&form.selected_span_min_30sec=$selected_span_min_30sec$&form.selected_span_min_60sec=$selected_span_min_60sec$&form.selected_stack=$drilldown_stack$&form.selected_host=$drilldown_host$&form.time.earliest=$drilldown_earliest$&form.time.latest=$drilldown_latest$ 132 | 133 | ]]> 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | Major events occurring in the cluster: Rolling restarts, CM Restarts, New CM, IDX crashes or CM crashes 143 | 144 | 145 | * 146 | 147 | 148 | 149 | label 150 | role_state 151 | 152 | 153 | 154 | * 155 | * 156 | 157 | 158 | 159 | | search role_state="$selected_event$" AND stack=$filter_stacks$ 160 | | eval 161 | duration=if(label="Rolling restart",last_time-_time, 0), 162 | started_str=strftime(_time,"%c"), 163 | ended_str=strftime(_time+duration,"%c"), 164 | earliest=if(label="Rolling restart", _time-duration*$rr_padding_before$, _time-$non_rr_before$), 165 | latest=if(label="Rolling restart", _time+duration+duration*$rr_padding_after$, _time+$non_rr_after$), 166 | duration_duration=latest-earliest, 167 | drilldown_earliest_str=strftime(earliest,"%m/%d/%Y:%H:%M:%S"), 168 | drilldown_latest_str=strftime(latest,"%m/%d/%Y:%H:%M:%S"), 169 | duration_str=tostring(duration,"duration"), 170 | duration_mins=round(duration/60), 171 | upgrade=if(last_version!='data.splunk_version', "upgrade", NULL) 172 | | sort - duration_mins 173 | | rex field=host "^(?:idx|c0m1)\-i\-[0-9a-f]+\.(?<stack>.*)\.(?:splunkcloud.com|splunkworks.lol)" 174 | | join stack type=left [| loadjob $splunk_versions_sid$] 175 | | table stack label current_splunk_version cost_per_hour duration duration_str drilldown_* earliest latest 176 | | rename earliest as _earliest 177 | | rename latest as _latest 178 | 179 | 180 | 181 | 182 | 183 | $row.host$ 184 | $row.stack$ 185 | $row._earliest$ 186 | $row._latest$ 187 | $drilldown_latest$-$drilldown_earliest$ 188 | ceiling($drilldown_duration$/$resolution$) 189 | ceiling($drilldown_duration$/$resolution$) 190 | if($selected_span_min_1sec$<10,10,$selected_span_min_1sec$) 191 | if($selected_span_min_1sec$<31,31,$selected_span_min_1sec$) 192 | if($selected_span_min_1sec$<60,60,$selected_span_min_1sec$) 193 |
cluster_master_performance?form.resolution=$resolution$&form.duration_seconds=$drilldown_duration$&form.selected_span_min_1sec=$selected_span_min_1sec$&form.selected_span_min_10sec=$selected_span_min_10sec$&form.selected_span_min_30sec=$selected_span_min_30sec$&form.selected_span_min_60sec=$selected_span_min_60sec$&form.selected_stack=$drilldown_stack$&form.selected_host=$drilldown_host$&form.time.earliest=$drilldown_earliest$&form.time.latest=$drilldown_latest$ 194 | 195 |
196 |
197 |
198 | 199 | 200 | 201 | 202 | | stats count by role_state label 203 | | rename label as event 204 | | eval label=event." (".count.")" 205 | 206 | 207 |
208 |
209 |
210 | 211 | 212 | cm_events_transaction 213 | 214 | Cluster master restarting - transactions 215 | 216 | | where 217 | (state="rr_start" and last_state="rr_end") 218 | OR (state="new" OR state="crash" OR state="restart") 219 | | eval label=case( 220 | state="crash" and role="cluster master", "CM crashed", 221 | state="crash" and role="indexer", "IDX crashed", 222 | state="new" and role="cluster master", "New CM", 223 | state="new" and role="indexer", "New IDX", 224 | state="restart" and role="cluster master", "CM restart", 225 | state="rr_start","Rolling restart"), 226 | role_state=role."_".state 227 | 228 | 229 | 230 |
231 |
232 |
233 | 234 | 235 | cm_events_streamstats 236 | 237 | Cluster master restarting - transactions 238 | 239 | | sort - _time 240 | | streamstats current=f 241 | last(state) as last_state 242 | last(_time) as last_time 243 | by host role 244 | 245 | 246 | 247 |
248 |
249 |
250 | 251 | 252 | cm_events 253 | 254 | Restarting components 255 | 256 | 257 | 258 | 259 | ($splunkd$ host=c0m1*.$selected_stack$.* 260 | (CASE(ServerConfig) CASE(My) CASE(is) CASE(GUID)) OR 261 | (rolling restart CASE(CMMaster) finished OR Starting)) 262 | OR 263 | ($splunkd$ host=idx*.$selected_stack$.* 264 | (CASE(ServerConfig) CASE(My) CASE(is) CASE(GUID)) newly) 265 | OR 266 | ($crash$ host=c0m1*.$selected_stack$.* OR host=idx*.$selected_stack$.* 267 | build CASE(Received) TERM(fatal) signal splunkd NOT(TERM(renamed:))) 268 | | eval state=case( 269 | searchmatch("Newly"),"new", 270 | searchmatch("GUID"),"restart", 271 | searchmatch("finished"), "rr_end", 272 | searchmatch("starting"), "rr_start", 273 | searchmatch("fatal"), "crash"), 274 | role=if(searchmatch("host=idx*"),"indexer","cluster master") 275 | | rex field=host "^[^\.]+\.(?<stack>[^\.]+)\." 276 | | table _time host stack role state 277 | 278 | $time.earliest$ 279 | $time.latest$ 280 | 281 | 282 | 283 | 284 |
285 |
286 |
287 |
288 | -------------------------------------------------------------------------------- /default/data/ui/views/home.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

Introduction

7 |

Welcome to Richard Morgan's collection of diagnostic dashboards. As a Principal SE Architect at Splunk working in the EMEA region for many years, I have developed my own dashboards to debug and assess the correct and efficient working of Splunk. I use these dashboards in my day-to-day job and I am proud to share them with the wider community.

8 | 9 |

All these dashboards have been built with the following principles:

10 |
    11 |
  • Install by copy and paste - Customers struggle to install apps, so every dashboard is self-contained and can be installed in isolation. This means no datamodels, no macros, no saved searches.
  12 |
  • Any search head will do - Not all users have access to the cluster master or the Monitoring Console (MC), and these frequently don't connect to all clusters. Pick a search head that connects to the most instances.
  13 |
  • Reverse engineer infrastructure - Rather than rely on the correct configuration of Splunk, try to reverse engineer what is an indexer, a search head, etc. from the messages that they generate.
  14 |
  • Use minimum permissions - In my experience the REST API isn't available to all users and cannot be relied on. Instead only use the lowest common denominator - search. Your user only needs permission to the _internal and _introspection indexes for this suite to work.
  15 |
  • Be as fast as possible - I hate waiting for results, so every dashboard has been aggressively optimised to use the least amount of computation. Typically, these dashboards run a single search on load; after this, any manipulation of the data on screen reuses the existing data sitting on the search head (see the sketch after this list).
  16 |
  • Showcase the power of Splunk - Use the cool features like annotations and selections, auto-hiding, and computation via tokens.
  17 |
  • Inline documentation - Help should be where you need it, i.e. on the dashboard you are using.
  18 |
  • Be information dense - Fit as much useful data as possible into a dashboard so that judgements can be made from a single pane of glass.
  19 |
  • Support workflow and transitions - Each dashboard allows you to transition to others while keeping your time range and selected concept, be it an index, a host, a search or a user.
  20 |
21 |
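A minimal sketch of the base-search pattern behind the "Be as fast as possible" principle (the index and field names here are illustrative, not taken from any specific dashboard). One expensive search runs when the dashboard loads:

    | tstats count where index=_internal by host sourcetype

Each panel then applies only a cheap post-process to the results already sitting on the search head, so no new indexer work is dispatched:

    | stats sum(count) as events by sourcetype
    | sort - events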

Catalogue

22 | You can access the dashboards and the sources from the links below. When installing the dashboards please make sure you get the names exactly right for the transitions to work. 23 |

24 | Debug incoming forwarders - download xml 25 |

26 |

This tool reads from the metrics about the forwarders connecting to your indexers. You can select an entire cluster or a single site and the tool will rank forwarders by their contribution of data. This is important, as it is very common for a single forwarder to send a significant amount of data to the cluster. In extreme cases it is not uncommon for 90% of the data to come from just 5% of the forwarders. It is therefore very important to tune this 5% to work efficiently and correctly; the remaining 95% of forwarders then become little more than rounding errors. This tool allows you to browse the forwarder population and understand, at a high level, its contribution, its burstiness, its maximum data rates, how long it takes to sweep the entire cluster, the software build and the OS.

27 |
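As a rough illustration of the kind of measurement involved (a sketch against the standard tcpin_connections metrics, not the dashboard's exact SPL), ranking forwarders by the data they contribute:

    index=_internal sourcetype=splunkd source=*metrics.log group=tcpin_connections
    | stats sum(kb) as total_kb dc(sourceIp) as connections by hostname
    | sort - total_kb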

28 | Debug Ingestion - download xml 29 |

30 |

This tool instruments all logging around the ingestion pipelines, including pipeline utilization, load, parsing errors, blocking, connecting forwarders, channel creation and output performance. You can select a single pipeline and see metrics about the sources passing through it, identify problematic sources and drill backwards through the forwarding chain to the source.

31 | 32 | 33 |
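As a rough illustration of the instrumentation this reads (a sketch against the standard metrics.log pipeline metrics; not the dashboard's exact SPL), ranking pipeline processors by CPU consumed:

    index=_internal sourcetype=splunkd source=*metrics.log group=pipeline
    | stats sum(cpu_seconds) as cpu_seconds by name processor
    | sort - cpu_seconds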

34 | Event distribution measurement - download xml 35 |

36 |

This tool measures event distribution in your Splunk environments: select a site or a cluster to perform the analysis on, then select a subset of the indexes within that site or cluster to measure. The dashboard then measures, at regular intervals, how events are distributed across the search peers. The distribution is then visualized in a series of charts. 37 |

38 | 39 | 40 | 41 |
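A minimal sketch of one measurement step (the index name and one-hour window are illustrative): count events per indexer with tstats, then compare the spread to the mean; a normalised standard deviation near zero means good distribution.

    | tstats count where index=main earliest=-1h latest=now by splunk_server
    | eventstats avg(count) as avg stdev(count) as stdev
    | eval normalised_stdev=stdev/avg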

42 | Event delay per index - download xml 43 |

44 |

This tool measures event delay at scale simply by comparing the indexed field "_indextime" and _time using tstats. By default, the search targets every event received over the last second. Use this tool to find indexes which are receiving events from the past or the future, and then drill in to find which specific hosts are responsible for the ill-timed events.

45 |
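A minimal sketch of the underlying comparison (the one-second index-time window mirrors the dashboard's default; field and command names follow its base search):

    | tstats max(_indextime) as indexed_time count where index=* _index_earliest=-2sec _index_latest=-1sec by index host _time span=1s
    | eval delay=indexed_time-_time
    | stats max(delay) as max_delay sum(count) as events by index host
    | sort - max_delay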

46 | Event delay for host - download xml 47 |

48 |

This tool is similar to "Event delay per index", but instead of measuring one second for all hosts, it measures one host for 24 hours (by default). Use this tool to understand if a host is sending data to the cluster in a timely manner.

49 | 50 | https://raw.githubusercontent.com/silkyrich/cluster_health_tools/master/default/data/ui/views/debug_replication.xml 51 | 52 |

53 | Debug replication delay - download xml 54 |

55 |

Blocking on indexers can be caused by a few factors: disk IO, saturated CPU, network latency and network throughput. Network throughput can be a problem in cloud environments where indexers are placed in various racks around the data center and contention for network resources comes into play; just because you have a 10Gbit interface doesn't mean you are going to get that between pairs of hosts. This dashboard measures the blocking reported by the indexers when sending to a remote host, allowing you to surface problematic indexers. It then plots the delay by reporting indexer, the remote blocked indexers and the individual bucket. 56 | 57 | 58 |

59 | 60 | 61 |

62 | Bucket roll analysis - download xml 63 |

64 |

This tool allows you to understand and measure the efficiency of the bucket rolling across all indexes, and drills down into the behaviour of specific indexes. Select a bucket and it constructs a search to find what is in that bucket. Use this tool to understand and reduce the frequency of bucket rolls.
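As a rough illustration of the instrumentation involved (a sketch against HotBucketRoller events, reusing the idx field name this app relies on elsewhere), counting rolls per index:

    index=_internal sourcetype=splunkd component=HotBucketRoller
    | stats count as bucket_rolls by idx
    | sort - bucket_rolls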

65 |

66 | Event distribution measurement - download xml 67 |

68 |

Event distribution is critical to search workload distribution and the scale-out of your environment. This tool measures how well event distribution is working in your environment. You can select multiple indexes and see how quickly randomisation is working.

69 | 70 | 71 |
72 |
73 |
74 | -------------------------------------------------------------------------------- /default/data/ui/views/internal_indexes_breakdown.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | index=_* 5 | index=_internal 6 | 7 | 8 | $role_index$ TERM(group=tcpin_connections) TERM(fwdType=uf) earliest=-60m latest=-58m 9 | | stats count by hostname 10 | | eval primary_role="universal_forwarder" 11 | | rename hostname as host 12 | | eval lower_host=lower(host) 13 | | fields - count 14 | | outputlookup host_to_roles.csv 15 | -24h@h 16 | now 17 | 18 | | noop 19 | 20 | 21 | 22 | $role_index$ earliest=-90min latest=now sourcetype=splunkd Metrics TERM(group=instance) TERM(name=instance) (deployment_client OR cluster_search_head OR search_head OR indexer OR cluster_slave OR search_peer OR license_master OR universal_forwarder) 23 | | stats 24 | count 25 | values(instance_roles) as roles 26 | values(index_cluster_label) as idx_cluster_name 27 | by instance_guid server_name 28 | | eval mv_roles=split(roles, ", ") 29 | | eval idx_cluster_name=if(idx_cluster_name="none", "", idx_cluster_name) 30 | | stats 31 | count(eval(mv_roles="universal_forwarder")) as role_universal_forwarder 32 | count(eval(mv_roles="indexer")) as role_indexer 33 | count(eval(mv_roles="search_peer")) as role_search_peer 34 | count(eval(mv_roles="cluster_search_head")) as role_cluster_search_head 35 | count(eval(mv_roles="search_head")) as role_search_head 36 | count(eval(mv_roles="cluster_slave")) as role_cluster_slave 37 | count(eval(mv_roles="kv_store")) as role_kv_store_role 38 | count(eval(mv_roles="cluster_master")) as role_cluster_master 39 | count(eval(mv_roles="license_master")) as role_license_master 40 | count(eval(mv_roles="deployment_server_master")) as role_deployment_server 41 | count(eval(mv_roles="deployment_client")) as role_deployment_client 42 | count(eval(mv_roles="shc_captain")) as role_shc_captain 43 | count(eval(mv_roles="shc_member")) as role_shc_member 44 | by instance_guid server_name roles idx_cluster_name 45 | | rename instance_guid as guid 46 | | rename server_name as host 47 | | eval primary_role=case( 48 | role_universal_forwarder>=1, "universal_forwarder", 49 | role_indexer>=1,"indexer", 50 | role_cluster_master>=1,"cluster_master", 51 | role_cluster_search_head>=1,"search_head_cluster", 52 | role_search_head>=1,"search_head", 53 | role_license_master>=1,"heavy_forwarder", 54 | role_deployment_server_master>=1,"deployment_server_master" 55 | ) 56 | | table host primary_role 57 | | outputlookup host_to_primary_role.csv 58 | -24h@h 59 | now 60 | 61 | | noop 62 | 63 | 64 | 65 | | tstats count where $internal_indexes$ by host index sourcetype 66 | | rex field=host "^(?<short_host>[^.]+)" 67 | | eval lower_host=lower(short_host) 68 | | lookup host_to_primary_role.csv host output primary_role as role_main 69 | | lookup host_to_roles.csv host output primary_role as role_tcp 70 | | lookup host_to_roles.csv host as short_host output primary_role as role_short 71 | | lookup host_to_roles.csv lower_host output primary_role as role_lower 72 | | eval all_roles="" 73 | | foreach role_* [| eval all_roles=if(isnotnull('<<FIELD>>'),mvappend(all_roles,'<<FIELD>>'),all_roles)] 74 | | fields - role_* 75 | | eval role=mvfilter(all_roles!="") 76 | | eval role=mvdedup(role) 77 | | fields - all_roles 78 | $role_done$ 79 | $tcp_done$ 80 | $time.earliest$ 81 | $time.latest$ 82 | 1 83 | 84 | 85 | 86 | | search $filter_indexes$ $filter_roles$ $filter_sourcetypes$ 87 | 88 | 89 | 90 | | stats count by sourcetype index role 91 | 92 |
93 | 94 | 95 | 96 | -60m@m 97 | now 98 | 99 | 100 | 101 | 102 | * 103 | label 104 | index 105 | 106 | | stats sum(count) as count by index 107 | | eval label=index." (".count.")" 108 | 109 | * 110 | , 111 | index IN ( 112 | ) 113 | 114 | 115 | 116 | * 117 | label 118 | role 119 | 120 | | stats sum(count) as count by role | eval label=role." (".count.")" 121 | 122 | * 123 | , 124 | role IN ( 125 | ) 126 | 127 | 128 | 129 | * 130 | label 131 | sourcetype 132 | 133 | | stats sum(count) as count by sourcetype 134 | | eval label=sourcetype." (".count.")" 135 | 136 | * 137 | , 138 | sourcetype IN ( 139 | ) 140 | 141 |
142 | 143 | 144 | Relative sizes of indexes 145 | 146 | 147 | | stats sum(count) by index 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | Relative numbers of roles 163 | 164 | 165 | | stats dc(host) by role 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | Relative volumes of sourcetypes 181 | 182 | 183 | | stats sum(count) by sourcetype 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | All hosts logging into the internal indexes 199 | 200 | 201 | | chart sum(count) as events by host index 202 | | eval total_events=0 203 | | foreach VALUE_* [| eval total_events=total_events+if(isnotnull('<<FIELD>>'),'<<FIELD>>',0) ] 204 | | sort - total_events 205 | | fields - total_events 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | What sourcetypes are populating each index? 217 | 218 | 219 | | stats sum(count) by index sourcetype 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | What roles are populating each index? 239 | 240 | 241 | | stats sum(count) by role index 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | All hosts logging into the internal indexes 258 | 259 | 260 | | stats sum(count) by role sourcetype 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | How does each role translate populate indexes? 278 | 279 | 280 | | stats sum(count) by role index 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 |
296 | -------------------------------------------------------------------------------- /default/data/ui/views/roll_your_own_tstats_acceleration.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/400)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$divisions$)) 5 | 6 | 7 | $search_prefix$ TERM($split_by$=*) 8 | 9 |
10 | 11 | 12 | 13 | -24h@h 14 | now 15 | 16 | 17 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/400)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$divisions$)) 18 | 19 | 20 | 21 | 22 | 23 | 400 24 | 25 | if((round(relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$divisions$)<1,1,round((relative_time(now(), $time.latest$)-relative_time(now(), $time.earliest$))/$divisions$)) 26 | 27 | 28 | 29 | 30 | 31 | index=itsi_summary 32 | 33 | 34 | 35 | | noop 36 | 37 | 38 | 39 | alert_severity 40 | 41 | 42 | 43 | 1 44 | 45 | 46 | 47 | 48 | -24h@h 49 | now 50 | 51 | 52 |
53 | 54 | 55 | 56 |

57 |
TSTATS is $improvement$ times faster
58 |

59 | 60 |
61 |
62 | 63 | 64 | 65 |

Summary

66 | If you have a data set whose variables appear in semantic form, without major separators, you can search for those values as terms.
In this example we are able to apply the method because the log lines are of the form alert_severity=something.
This means you can search for TERM(alert_severity=something) and count the events which contain that term.
However, if the log lines were alert_severity="something" we wouldn't be able to use the method, because " is a major separator.
67 | In the example we have the traditional search and a tstats search that executes multiple tstats commands, one for each possible value of alert_severity.
As this page runs it compares the execution speed of the two approaches.
68 | Note that this approach works best when the cardinality is small, but it is very flexible. For instance, if you were dealing with percentages you might use wildcards, e.g. TERM(value=9*) 69 | 70 |
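A minimal sketch of what the generated search looks like for two assumed values of alert_severity ("normal" and "high" are illustrative; the dashboard discovers the real values and emits one tstats leg per value, tagging each leg with an eval as its own generator does):

    | tstats prestats=t count where index=itsi_summary TERM(alert_severity=normal) by _time span=60sec
    | eval alert_severity=if(isnull(alert_severity),"normal",alert_severity)
    | tstats prestats=t append=t count where index=itsi_summary TERM(alert_severity=high) by _time span=60sec
    | eval alert_severity=if(isnull(alert_severity),"high",alert_severity)
    | timechart span=60sec count by alert_severity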
71 |
72 | 73 | 74 | Search returned $search_progress$ events in $search_runtime$ seconds 75 | 76 | 77 | 78 | $job.eventCount$ 79 | $job.runDuration$ 80 | 81 | 82 | round($search_runtime$/$tstats_runtime$,0) 83 | 84 | $search_prefix$ TERM($split_by$=*) $regex$ 85 | | timechart limit=50 span=$seconds_for_bin$sec count by $split_by$ 86 | $time.earliest$ 87 | $time.latest$ 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 |

The normal search

98 | $search_prefix$ TERM($split_by$=*)
99 | $regex$ 100 | | timechart span=$seconds_for_bin$sec count by $split_by$ 101 | 102 |
103 |
104 | 105 | 106 | 107 | Events matched 108 | 109 | $search_prefix$ TERM($split_by$=*) $regex$ 110 | $sampling_time.earliest$ 111 | $sampling_time.latest$ 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | $columns_dynamic$ 122 | 123 | 124 | $result.columns$ 125 | 126 | $search_prefix$ TERM($split_by$=*) 127 | | regex $split_by$!="\s" 128 | | stats dc($split_by$) as count values($split_by$) as columns 129 | | eval click="Generate tstats search for ".count." columns" 130 | | table click columns 131 | $sampling_time.earliest$ 132 | $sampling_time.latest$ 133 | 1 134 | 135 | 136 | 137 | 138 | $row.columns$ 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | $result.search$ 149 | 150 | | makeresults 151 | | eval columns="$columns$" 152 | | makemv delim="," columns 153 | | mvexpand columns 154 | | streamstats count 155 | | eval search=if(count=1, "| tstats prestats=t ", "| tstats prestats=t append=t ")." count where $search_prefix$ TERM($split_by$=".columns.") by _time span=$seconds_for_bin$sec | eval $split_by$=if(isnull($split_by$),\"".columns."\",$split_by$)" 156 | | sort + count 157 | | stats values(search) as search 158 | | eval search=mvjoin(search," ")."| timechart limit=50 span=$seconds_for_bin$sec count by $split_by$" 159 | -24h@h 160 | now 161 | 162 | 163 | 164 |
165 |
166 |
167 | 168 | 169 | Tstats returned $tstats_progress$ events in $tstats_runtime$ seconds 170 | 171 | 172 | 173 | $job.eventCount$ 174 | $job.runDuration$ 175 | 176 | $tstats_search$ 177 | $time.earliest$ 178 | $time.latest$ 179 | 1 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 |

The tstats search

218 | $tstats_search$
219 | | timechart span=$seconds_for_bin$sec count by alert_severity
220 |
221 |
222 |
223 | -------------------------------------------------------------------------------- /default/data/ui/views/search_performance_evaluator.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | | noop 4 | | noop 5 | 6 | 7 |
8 | 9 | 10 | 11 | -24h@h 12 | now 13 | 14 | 15 | 16 | 17 | | noop 18 | 19 |
20 | 21 | 22 | Entered search, optimized search, remote search, run duration 23 | 24 |

$search$

25 |

$optimizedSearch$

26 |

$remoteSearch$

27 |

$runDuration$ seconds

28 | 29 |
30 |
31 | 32 | 33 | 34 | 60 |

Search specification

61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 |
Name | Description | Value
SPL | The search entered by the user | $search$
Optimized SPL | The search post optimization | $optimizedSearch$
Keywords | The keywords found in SPL | $keywords$
LISPY | The query on TSIDX | $lispy$
88 | 89 |
90 | 91 | 92 |

Time range specification

93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 |
Name | Description | Value
Relative earliest | The "from" time range | $earliest_time$
Relative latest | The "to" time range | $latest_time$
EPOCH earliest | The "from" time range in epoch seconds | $searchEarliestTime$
EPOCH latest | The "to" time range in epoch seconds | $searchLatestTime$
Duration of search | Duration in seconds | $duration_seconds$
Duration of search | Duration in human terms | $duration_str$
130 | 131 | 132 |
133 |
134 | 135 | 136 | 137 |

Bucket elimination performance

138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 |
Name | Description | Value
consideredBuckets | The number of candidate buckets (index+timerange) | $searchTotalBucketsCount$
eliminatedBuckets | The number of buckets eliminated via bloomfilters and metadata | $searchTotalEliminatedBucketsCount$
eliminatedBuckets as pct | The number of buckets eliminated as percentage | $searchTotalEliminatedBucketsCount_pct$ %
160 | 161 | 162 |
163 | 164 | 165 |

Post elimination, bucket searching performance

166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 |
Name | Description | Value
scanCount | How many events were evaluated | $scanCount$
eventCount | How many events were returned | $eventCount$
dropCount | How many events were filtered out by schema on the fly | $dropCount$
dropPercentage | What percentage of events were dropped by schema on the fly | $dropPerc$ %
194 | 195 | 196 |
197 |
198 | 199 | 200 | Events returned by search 201 | 202 | Edit this search to get performance breakdown 203 | 204 | 205 | $job.sid$ 206 | replace($job.sid$,"_"," ") 207 | 208 | $my_search$ 209 | $time.earliest$ 210 | $time.latest$ 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | | noop 220 | 221 | 222 |
223 |
224 |
225 | 226 | 227 | Job inspector for search 228 | 229 | 230 | | rest /services/search/jobs/$sid$ 231 | -24h@h 232 | now 233 | 234 | $result.search$ 235 | $result.indexEarliestTime$ 236 | $result.indexLatestTime$ 237 | round($result.searchEarliestTime$) 238 | round($result.searchLatestTime$) 239 | $searchLatestTime$-$searchEarliestTime$ 240 | tostring($searchLatestTime$-$searchEarliestTime$,"duration") 241 | $result.keywords$ 242 | $result.optimizedSearch$ 243 | $result.remoteSearch$ 244 | $result.searchTotalEliminatedBucketsCount$ 245 | $result.searchTotalBucketsCount$ 246 | round(100*$searchTotalEliminatedBucketsCount$/$searchTotalBucketsCount$,2) 247 | $result.scanCount$ 248 | $result.eventCount$ 249 | $result.scanCount$-$result.eventCount$ 250 | round(100*($result.scanCount$-$result.eventCount$)/$result.scanCount$,2) 251 | $result.request.earliest_time$ 252 | $result.request.latest_time$ 253 | 100*round($eventCount$/$scanCount$,4) 254 | round(100*$searchTotalEliminatedBucketsCount$/$searchTotalBucketsCount$,2) 255 | round(100*$searchTotalEliminatedBucketsCount$/$searchTotalBucketsCount$,2) 256 | round($result.runDuration$,2) 257 | round($result.performance.command.search.rawdata.invocations$,2) 258 | 259 | 260 | 261 |
262 |
263 |
264 | 265 | 266 | Get LISPY from search log 267 | 268 | 269 | | rest /services/search/jobs/$sid$/search.log 270 | | rex field=value "base lispy:(?<lispy>.*)" 271 | | fields + lispy 272 | -24h@h 273 | now 274 | 275 | $result.lispy$ 276 | 277 | 278 | 279 | 280 |
281 |
282 | 283 | 284 | 285 | | makeresults 286 | | eval start_time=tonumber(replace("$sid$",".*_(\d+)\.\d+.*","\1"),10)-2 287 | -24h@h 288 | now 289 | 290 | $result.start_time$ 291 | $sid_started$+$runDuration$+20 292 | 293 | 294 | 295 | 296 |
297 |
298 |
299 | 300 | 301 | Audit.log information for the search 302 | 303 | 304 | 305 | if($job.eventCount$ = 0,random(),$run_again$) 306 | 307 | index=_audit NOT TERM(ignore_me=) TERM(action=search) TERM(info=completed) earliest=$sid_started$ latest=now $sid$ 308 | | eval ignore_me=null() 309 | | rex field=_raw "search_id='(?<sid>[^']+)'" 310 | | where sid="$sid$" 311 | | table search_id target_index total_run_time 312 | event_count 313 | result_count 314 | available_count 315 | scan_count 316 | drop_count 317 | exec_time 318 | api_et 319 | api_lt 320 | search_et 321 | search_lt 322 | is_realtime 323 | search_startup_time 324 | searched_buckets 325 | eliminated_buckets 326 | considered_events 327 | total_slices 328 | decompressed_slices 329 | duration_command_search_index 330 | invocations_command_search_index_bucketcache_hit 331 | duration_command_search_index_bucketcache_hit 332 | invocations_command_search_index_bucketcache_miss 333 | duration_command_search_index_bucketcache_miss 334 | invocations_command_search_index_bucketcache_error 335 | duration_command_search_rawdata 336 | invocations_command_search_rawdata_bucketcache_hit 337 | duration_command_search_rawdata_bucketcache_hit 338 | invocations_command_search_rawdata_bucketcache_miss 339 | duration_command_search_rawdata_bucketcache_miss 340 | invocations_command_search_rawdata_bucketcache_error 341 | | eval run_again="$run_again$" 342 | | eval run_again="$run_again$" 343 | | transpose header_field=search_id column_name=search_id 344 | $time.earliest$ 345 | $time.latest$ 346 | 347 | 348 | 349 | 350 |
351 |
352 |
353 |
354 | -------------------------------------------------------------------------------- /default/searches.conf: -------------------------------------------------------------------------------- 1 | [Create forwarder inventory] 2 | action.cefout2.enabled = 0 3 | action.email.useNSSubject = 1 4 | alert.track = 0 5 | cron_schedule = 0 3 * * * 6 | dispatch.earliest_time = -4h@m 7 | dispatch.latest_time = now 8 | display.events.fields = ["host","source","sourcetype","index"] 9 | display.general.type = statistics 10 | display.page.search.mode = verbose 11 | display.page.search.tab = statistics 12 | display.visualizations.charting.chart = line 13 | display.visualizations.show = 0 14 | enableSched = 1 15 | request.ui_dispatch_app = search 16 | request.ui_dispatch_view = search 17 | schedule_window = 480 18 | search = | union \ 19 | [ search index=_internal Metrics (group=tcpin_connections OR group=tcpout_connections) sourcetype=splunkd \ 20 | | stats count(eval(group="tcpin_connections")) as tcp_in \ 21 | count(eval(group="tcpout_connections")) as tcp_out \ 22 | max(ingest_pipe) as pipelines dc(hostname) as clients by host \ 23 | | eval role=case(tcp_in>0 and tcp_out>0, "intermediate", tcp_in>0 and tcp_out=0, "indexer", tcp_in=0 and tcp_out>0, "endpoint") \ 24 | | fields + host role pipelines] \ 25 | [ search index=_internal Metrics group=tcpin_connections sourcetype=splunkd \ 26 | | stats values(os) as endpoint_os values(arch) as endpoint_arch values(build) as endpoint_build values(version) as endpoint_version values(ssl) as ssl values(ack) as ack values(fwdType) as fwdType by hostname \ 27 | | rename hostname as host] \ 28 | | stats values(*) as * by host\ 29 | | outputlookup hosts_to_roles.csv 30 | -------------------------------------------------------------------------------- /local/app.conf: -------------------------------------------------------------------------------- 1 | [ui] 2 | 3 | [launcher] 4 | version = .0 5 | -------------------------------------------------------------------------------- /local/data/ui/views/bucket_size_analysis: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | auto 7 | auto_high_volume 8 | auto_high_volume 9 | 10 |
11 | 12 | 13 | Distribution of bucket sizes by index for auto_high_volume indexes (max size=10GB) 14 | 15 | 16 | 17 | 0 18 | 19 | 20 | 21 | 22 | 23 | | dbinspect 24 | [| rest /services/data/indexes 25 | | eval index=title 26 | | stats values(maxDataSize) as maxDataSize by index 27 | | where maxDataSize="$size$" 28 | | eval index="index=".index 29 | | stats values(index) as indexes 30 | | mvcombine delim=" " indexes 31 | | eval search=indexes ] 32 | | bin sizeOnDiskMB span=2log4 33 | | chart limit=0 count by sizeOnDiskMB index 34 | $time.earliest$ 35 | $time.latest$ 36 | 1 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | Span, bucket size and number of rolls per index as per "HotBucketRoller" for auto_high_volume indexes 68 | 69 | 70 | index=_internal source=*splunkd.log sourcetype=splunkd component=HotBucketRoller caller=* 71 | [| rest /services/data/indexes 72 | | eval index=title 73 | | stats values(maxDataSize) as maxDataSize by index 74 | | where maxDataSize="$size$" 75 | | eval index="idx=\"".index."\"" 76 | | stats values(index) as indexes 77 | | eval search="(".mvjoin(indexes," OR ").")" ] 78 | | eval makeEpochs=split(to,"_") 79 | | eval a=mvindex(makeEpochs,1),b=mvindex(makeEpochs,2),c=a-b, idx=idx+":"+caller 80 | | eval c=c/86400 81 | | stats avg(c) As BucketSpanDays, avg(size) As BucketSizeGb, count AS BucketRolls by idx 82 | | eval BucketSizeGb=round(((BucketSizeGb/1024)/1024)/1024,3) 83 | $time.earliest$ 84 | $time.latest$ 85 | 1 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 |
115 | -------------------------------------------------------------------------------- /local/data/ui/views/bursting_forwarders_and_indexing_delay.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | [ search 6 | [ search _index_earliest=-60m@m _index_latest=now index=_internal * sourcetype=splunkd source="*metrics.log" component=Metrics group=per_host_thruput 7 | [| inputcsv hosts_to_roles.csv 8 | | where role="intermediate" 9 | | eval host="host=".host 10 | | stats values(host) as hosts 11 | | eval search="(".mvjoin(hosts, " OR ").")"] 12 | | stats max(kbps) as max_kbps by series 13 | | where max_kbps > 4800 14 | | stats values(series) as host 15 | | eval search="(index=_internal sourcetype::splunkd source::/opt/splunkforwarder/var/log/splunk/metrics.log per_index_thruput host=*) AND (".mvjoin(host," OR host::").")"] 16 | | stats max(kbps) as max_kbps by series host 17 | | where max_kbps > 4800 18 | | eval search="(index=".series." AND host=".host.")" 19 | | fields - max_kbps series host 20 | | stats values(search) as search 21 | | eval search="_index_earliest=-60m@m _index_latest=now AND (".mvjoin(search," OR ").")"] 22 | | noop sample_ratio=10000 23 | 24 | $time.earliest$ 25 | $time.latest$ 26 | 27 |
28 | 29 | 30 | 31 | -4h@m 32 | now 33 | 34 | 35 |
36 | 37 | 38 | 39 | Delay by timestamp 40 | 41 | 42 | | eval delay=_indextime-_time, series=host." indexing to ".index 43 | | timechart limit=50 max(delay) by series 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | Delay by time indexed 58 | 59 | 60 | | eval delay=_indextime-_time, _time=_indextime, series=host." indexing to ".index 61 | | timechart limit=50 max(delay) by series 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
-------------------------------------------------------------------------------- /local/data/ui/views/event_distribution_measurements.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | $generated_search$ 6 | 7 | 8 |
9 | 10 | 11 | label 12 | index 13 | 14 | | eventcount index=* index=_* summarize=false | stats sum(count) as size by index | sort - size | eval label=index."(".size.")" 15 | -24h@h 16 | now 17 | 18 | main 19 | ( 20 | ) 21 | index= 22 | OR 23 | 24 | 25 | 26 | label 27 | host 28 | 29 | | tstats count where $selected_index$ by host | eval label=host."(".count.")" 30 | -24h@h 31 | now 32 | 33 | ( 34 | ) 35 | host= 36 | OR 37 | All 38 | * 39 | 40 | 41 | 42 | 10 43 | 44 | 45 | 46 | 4 47 | 48 | 49 | 50 | 60 51 | 60 52 | 53 |
54 | 55 | 56 | 6. Select on a column to select the number of samples we will take, best keep it to below 10 57 | 58 | 59 | 10 60 | 61 | 62 | Rate of time increase in seconds 63 | 64 | | makeresults 65 | | eval step=mvrange(1,$max_samples_to_offer$,1) 66 | | mvexpand step 67 | | eval "time range in seconds"=$step_size$*pow(step,$power$) 68 | | fields step "time range in seconds" 69 | | fields - _time 70 | -24h@h 71 | now 72 | 73 | 74 | 75 | 76 | $click.value$ 77 | 78 | 79 | 80 | 81 | 7. Click on the duration to execute the search 82 | 83 | 84 | host 85 | index 86 | index 87 | 88 | 89 | The format is "days+hours:minutes:seconds" i.e 10+6:30:00 = 10 days and 6 hours 90 | 91 | | makeresults 92 | | eval step=mvrange(1,$steps$,1) 93 | | mvexpand step 94 | | eval step_size=$step_size$*pow(step,$power$) 95 | | eval jump=step_size+60 96 | | fields step_size jump step 97 | | eval tstats_preamble=if(step==1,"| tstats prestats=t","| tstats prestats=t append=t") 98 | | eval tstats_search=" 99 | ".tstats_preamble." count max(_time) as latest_time min(_time) as min_time where earliest=-".jump."sec latest=-$offset$sec $selected_host$ $selected_index$ by splunk_server $split_by$ 100 | | eval period=if(isNull(period),\"".step_size."\",period) 101 | ", step_string=tostring(step_size,"duration") 102 | | fields - jump tstats_preamble 103 | | stats last(step_string) as max_history list(*) as * 104 | | eval tstats_search="[".mvjoin(tstats_search," ")." | stats count by period splunk_server index ]" 105 | | eventstats last(step_string) as max_history_str last(step_size) as max_history 106 | | eval step_string=mvjoin(step_string,","), step_size=mvjoin(step_size,","), step=mvjoin(step,",") 107 | | eval post_process="| stats sum(count) as count by period splunk_server index" 108 | | eval padding_search="[| tstats prestats=t count where earliest=-".max_history."sec latest=-60sec $selected_host$ $selected_index$ by splunk_server $split_by$ 109 | | stats count by splunk_server index 110 | | eval count=0 111 | | eval period=\"".step_size."\" 112 | | makemv delim=\",\" period 113 | | mvexpand period]" 114 | | eval search="| union ".padding_search.tstats_search.post_process 115 | | fields max_history_str search step_string 116 | -24h@h 117 | now 118 | 119 | 120 | 121 | $row.search$ 122 | 123 | 124 | 125 | 126 | 127 | 128 | What was the normalised standard deviation over time? 129 | 130 | Until all indexers have received data the standard deviation cannot be calculated 131 | 132 | | stats avg(count) as avg stdev(count) as stdev by period $split_by$ 133 | | eval ratio=stdev/avg 134 | | fields - stdev avg 135 | | chart limit=100 values(ratio) as ratio by period $split_by$ 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | How many indexers received data during each period? 
145 | 146 | We need this to ramp up as quickly as possible 147 | 148 | | where count!=0 149 | | chart dc(splunk_server) by period $split_by$ 150 | | sort period 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | Events scanned in each step 160 | 161 | Events scanned goes up quickly 162 | 163 | | chart limit=100 sum(count) as ratio by period $split_by$ 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | Variation of events across the indexers $selected_index$ 176 | 177 | 178 | index 179 | index 180 | 181 | | rest /services/data/indexes 182 | | rename title as index 183 | | fields + index 184 | | search $selected_index$ 185 | | stats count by index 186 | | fields - count 187 | 188 | * 189 | ( 190 | ) 191 | index=" 192 | " 193 | OR 194 | * 195 | 196 | 197 | Variation of events across the indexers 198 | 199 | | search $filter_index$ 200 | | stats count by period splunk_server 201 | | xyseries splunk_server period count 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 |
212 | -------------------------------------------------------------------------------- /local/data/ui/views/intermediate_forwarders_switching_efficiency_analysis.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | 7 | -4h@h 8 | now 9 | 10 | 11 | 12 | 13 | 90 14 | 15 |
16 | 17 | 18 | What percentage of the pool is an intermediate forwarder not connecting to 19 | 20 | Once a forwarder hits the bozo bit it gives up on the indexer for 30 sec * size of the pool. We can see that forwarders are shrinking their pool and refreshing again. Red is bad; low numbers are better 21 | 22 | index="_internal" sourcetype=splunkd TcpOutputProcessor connect_try 23 | [| inputcsv hosts_to_roles.csv 24 | | where role="intermediate" 25 | | eval host="host=".host 26 | | stats values(host) as hosts 27 | | eval search="(".mvjoin(hosts, " OR ").")"] 28 | | rex field=_raw "destIp=(?<destIp>[^,]+)," 29 | | lookup ip_to_indexer_name.csv ips as destIp output indexer_name as target_indexer 30 | | rex field=target_indexer "idx(?<idx>\d+).ppbf.splunkcloud.com" 31 | | eval destIp=if(idx!="",idx,destIp) 32 | | timechart limit=0 span=10min dc(destIp) as pool by host 33 | | foreach * 34 | [| eval <<FIELD>>=((1-round(('<<FIELD>>'/90),2))*100)] 35 | $time.earliest$ 36 | $time.latest$ 37 | 38 | 39 | 40 | 41 |
42 |
43 |
44 | 45 | 46 | Forwarder removing stale URLs to indexer id 47 | 48 | When a forwarder gives up on an indexer it takes a few hours before it reconnects to it again. 49 | 50 | index="_internal" host=*suf* sourcetype=splunkd TcpOutputProc removing stale url 51 | | rex field=_raw "stale url : (?<destIp>[^:]+):" 52 | | lookup ip_to_indexer_name.csv ips as destIp output indexer_name as target_indexer 53 | | rex field=target_indexer "idx(?<idx>\d+).ppbf.splunkcloud.com" 54 | | rex field=host "^(?<short_name>.*)(-prd\.prd\.betfair)|(.betfair)" 55 | | timechart limit=0 values(idx) by short_name 56 | | foreach * [| eval <<FIELD>>=mvjoin('<<FIELD>>',", ")] 57 | $time.earliest$ 58 | $time.latest$ 59 | 1 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 |
69 |
70 |
71 |
-------------------------------------------------------------------------------- /local/data/ui/views/top_data_generating_source_forwarder_analysis.xml: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | 5 | 6 | 7 | -60m@m 8 | now 9 | 10 | 11 | 12 | 13 | * 14 | 15 | 16 | 17 | 50 18 | 19 |
24 | 200
27 | Top $number_forwarders$ source forwarders and their average event streams over $max_kbps_sourcetype_avg$ kbps by sourcetype
29 | [search index=_internal host=$forwarder_name$ sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/metrics.log" component=Metrics group=per_host_thruput
30 | | stats avg(kbps) as avg_kbps by series
31 | | sort - avg_kbps
32 | | head $number_forwarders$
33 | | stats values(series) as host
34 | | eval search="(index=_internal sourcetype::splunkd source::/opt/splunkforwarder/var/log/splunk/metrics.log per_sourcetype_thruput host=$forwarder_name$) AND (host::".mvjoin(host," OR host::").")"]
35 | | stats avg(kbps) as avg_kbps by series host
36 | | where avg_kbps > $max_kbps_sourcetype_avg$
37 | | xyseries host series avg_kbps
38 | | addtotals
39 | | sort - Total
40 | | fields - Total
41 | $time.earliest$
42 | $time.latest$
54 | 200
57 | Top $number_forwarders$ source forwarders with maximum throughput over $max_kbps_sourcetype_max$ kbps by sourcetype
59 | [search index=_internal host=$forwarder_name$ sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/metrics.log" component=Metrics group=per_host_thruput
60 | | stats max(kbps) as max_kbps by series
61 | | sort - max_kbps
62 | | head $number_forwarders$
63 | | stats values(series) as host
64 | | eval search="(index=_internal sourcetype::splunkd source::/opt/splunkforwarder/var/log/splunk/metrics.log per_sourcetype_thruput host=$forwarder_name$) AND (host::".mvjoin(host," OR host::").")"]
65 | | stats max(kbps) as max_kbps by series host
66 | | where max_kbps > $max_kbps_sourcetype_max$
67 | | xyseries host series max_kbps
68 | | addtotals
69 | | sort - Total
70 | | fields - Total
71 | $time.earliest$
72 | $time.latest$
86 | 200
89 | Top $number_forwarders$ source forwarders and their average event streams over $max_kbps_source$ kbps by source
91 | [search index=_internal host=$forwarder_name$ sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/metrics.log" component=Metrics group=per_host_thruput
92 | | stats avg(kbps) as avg_kbps by series
93 | | sort - avg_kbps
94 | | head $number_forwarders$
95 | | stats values(series) as host
96 | | eval search="(index=_internal sourcetype::splunkd source::/opt/splunkforwarder/var/log/splunk/metrics.log per_source_thruput host=$forwarder_name$) AND (host::".mvjoin(host," OR host::").")"]
97 | | stats avg(kbps) as avg_kbps by series host
98 | | where avg_kbps > $max_kbps_source$
99 | | xyseries host series avg_kbps
100 | | addtotals
101 | | sort - Total
102 | | fields - Total
103 | $time.earliest$
104 | $time.latest$
116 | 200
119 | Top $number_forwarders$ source forwarders with maximum throughput over $max_kbps_index$ kbps by index
121 | [search index=_internal host=$forwarder_name$ sourcetype=splunkd source="/opt/splunkforwarder/var/log/splunk/metrics.log" component=Metrics group=per_host_thruput
122 | | stats max(kbps) as max_kbps by series
123 | | sort - max_kbps
124 | | head $number_forwarders$
125 | | stats values(series) as host
126 | | eval search="(index=_internal sourcetype::splunkd source::/opt/splunkforwarder/var/log/splunk/metrics.log per_index_thruput host=$forwarder_name$) AND (host::".mvjoin(host," OR host::").")"]
127 | | stats max(kbps) as max_kbps by series host
128 | | where max_kbps > $max_kbps_index$
129 | | xyseries host series max_kbps
130 | | addtotals
131 | | sort - Total
132 | | fields - Total
133 | $time.earliest$
134 | $time.latest$
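Note: all four panels above use the same two-stage pattern. A subsearch first ranks forwarders by per_host_thruput and keeps the top $number_forwarders$, then builds a field literally named search with mvjoin(), which Splunk expands into host:: terms for the outer per-*_thruput search, so only the top-N forwarders are scanned a second time. A minimal sketch of the pattern in isolation (the top-10 cutoff and field names here are illustrative, not taken from the dashboard):

    [ search index=_internal group=per_host_thruput
      | stats avg(kbps) as avg_kbps by series
      | sort - avg_kbps
      | head 10
      | stats values(series) as host
      | eval search="(index=_internal group=per_sourcetype_thruput) AND (host::" . mvjoin(host, " OR host::") . ")" ]
    | stats avg(kbps) as avg_kbps by series host

Because the subsearch returns only the generated search field, its value is substituted into the outer pipeline as literal search terms before the outer search runs.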
--------------------------------------------------------------------------------
/local/savedsearches.conf:
--------------------------------------------------------------------------------
1 | [Create endpoint classification table]
2 | action.cefout2.enabled = 0
3 | action.email.useNSSubject = 1
4 | alert.track = 0
5 | cron_schedule = 0 5 * * *
6 | dispatch.earliest_time = -24h@h
7 | dispatch.latest_time = now
8 | dispatch.sample_ratio = 1000
9 | display.general.type = statistics
10 | display.page.search.tab = statistics
11 | display.visualizations.show = 0
12 | enableSched = 1
13 | request.ui_dispatch_app = event_distribution_tools
14 | request.ui_dispatch_view = search
15 | search = | union \
16 | [ search index=_internal Metrics (group=tcpin_connections OR group=tcpout_connections) sourcetype=splunkd \
17 | | stats count(eval(group="tcpin_connections")) as tcp_in \
18 | count(eval(group="tcpout_connections")) as tcp_out \
19 | max(ingest_pipe) as pipelines dc(hostname) as clients by host \
20 | | eval role=case(tcp_in>0 and tcp_out>0, "intermediate", tcp_in>0 and tcp_out=0, "indexer", tcp_in=0 and tcp_out>0, "endpoint") \
21 | | fields + host role pipelines] \
22 | [ search index=_internal Metrics group=tcpin_connections sourcetype=splunkd \
23 | | stats values(os) as endpoint_os values(arch) as endpoint_arch values(build) as endpoint_build values(version) as endpoint_version values(ssl) as ssl values(ack) as ack values(fwdType) as fwdType by hostname \
24 | | rename hostname as host] \
25 | | stats values(*) as * by host \
26 | | outputcsv hosts_to_roles.csv
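This scheduled search classifies every host seen in the forwarding tier: the tcp_in/tcp_out connection counts decide whether a host is an endpoint, an intermediate forwarder, or an indexer, and the second union branch enriches hosts with the OS, architecture, build, and version details they report over tcpin_connections. Because the result is written with outputcsv, it lands in the dispatch CSV directory and can be read back with inputcsv; a minimal consumption sketch (the breakdown by role is just an example):

    | inputcsv hosts_to_roles.csv
    | stats count by role

To enrich other searches with the lookup command instead, the final outputcsv would need to become outputlookup against a defined lookup table.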
--------------------------------------------------------------------------------
/metadata/default.meta:
--------------------------------------------------------------------------------
1 |
2 | # Application-level permissions
3 |
4 | []
5 | access = read : [ * ], write : [ admin, power ]
6 |
7 | ### EVENT TYPES
8 |
9 | [eventtypes]
10 | export = system
11 |
12 |
13 | ### PROPS
14 |
15 | [props]
16 | export = system
17 |
18 |
19 | ### TRANSFORMS
20 |
21 | [transforms]
22 | export = system
23 |
24 |
25 | ### LOOKUPS
26 |
27 | [lookups]
28 | export = system
29 |
30 |
31 | ### VIEWSTATES: even normal users should be able to create shared viewstates
32 |
33 | [viewstates]
34 | access = read : [ * ], write : [ * ]
35 | export = system
--------------------------------------------------------------------------------
/metadata/local.meta:
--------------------------------------------------------------------------------
1 | [app/ui]
2 | version = 7.0.1
3 | modtime = 1523794851.944065000
4 |
5 | [app/launcher]
6 | version = 7.0.1
7 | modtime = 1523794851.948698000
8 |
9 | [savedsearches/Create%20endpoint%20classification%20table]
10 | access = read : [ * ], write : [ admin ]
11 | export = none
12 | owner = admin
13 | version = 7.0.1
14 | modtime = 1523796757.199542000
15 |
16 | [views/bursting_forwarders_and_indexing_delay]
17 | access = read : [ * ], write : [ admin ]
18 | export = none
19 | owner = admin
20 | version = 7.0.1
21 | modtime = 1523796790.162265000
22 |
23 | [views/event_distribution_measurements]
24 | access = read : [ * ], write : [ admin ]
25 | export = none
26 | owner = admin
27 | version = 7.0.1
28 | modtime = 1523796799.508039000
29 |
30 | [views/top_data_generating_source_forwarder_analysis]
31 | access = read : [ * ], write : [ admin ]
32 | export = none
33 | owner = admin
34 | version = 7.0.1
35 | modtime = 1523796810.220912000
36 |
37 | [views/intermediate_forwarders_switching_efficiency_analysis]
38 | access = read : [ * ], write : [ admin ]
39 | export = none
40 | owner = admin
41 | version = 7.0.1
42 | modtime = 1523796831.913665000
--------------------------------------------------------------------------------
/vcpu_pricing:
--------------------------------------------------------------------------------
1 | Please download from:
2 |
3 | https://github.com/silkyrich/cluster_health_tools/blob/master/default/data/ui/views/vcpu_infrastructure_sizing.xml
--------------------------------------------------------------------------------