├── README.md ├── ceph-status.sh ├── ceph_cron.txt └── zabbix_templates ├── zbx_ceph_cluster_template.xml ├── zbx_ceph_cluster_template_v1_format.xml ├── zbx_ceph_mds_template.xml ├── zbx_ceph_mon_template.xml └── zbx_ceph_osd_template.xml /README.md: -------------------------------------------------------------------------------- 1 | =========== 2 | Main improvements: 3 | 4 | 1. Uses zabbix-agent (active) mode, which is more efficient 5 | 6 | 2. The collection script is deployed on several machines, all sending data to the same Zabbix HOST, which avoids a single point of failure 7 | 8 | 3. Adds support for read/write IOPS items 9 | 10 | 4. Adds alerts for excessively high IOPS and for Ceph ERR status 11 | 12 | =========== 13 | 14 | 15 | ceph-zabbix-active 16 | =========== 17 | 18 | Zabbix active plugin for Ceph monitoring 19 | 20 | Installation 21 | =========== 22 | 23 | A. On one Ceph Monitor Host: 24 | 25 | 1. git clone https://github.com/BodihTao/ceph-zabbix.git 26 | 27 | 2. cd ceph-zabbix 28 | 29 | 3. sudo cp ceph-status.sh /etc/zabbix/scripts/ 30 | 31 | 4. Edit ceph_cron.txt to set the serverActiveIP and the ZabbixHost to push to 32 | 33 | 5. sudo cp ceph_cron.txt /etc/cron.d/ 34 | 35 | On the other Monitor Hosts: 36 | repeat the same steps as in A. 37 | That way, if one monitor host goes down, the Ceph cluster Zabbix monitoring keeps working. 38 | 39 | 40 | B. On the Zabbix Dashboard: 41 | 42 | 1. Import the XML templates. 43 | 44 | 2. Link the Ceph templates to your ZabbixHost 45 | 46 | 47 | 48 | 49 | 50 | ============== 51 | 52 | Data is sent directly from the script to Zabbix through zabbix-trapper items 53 | -------------------------------------------------------------------------------- /ceph-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ceph_bin="/usr/bin/ceph" 4 | rados_bin="/usr/bin/rados" 5 | zabbix_sender_bin="/usr/bin/zabbix_sender" 6 | 7 | 8 | # Initialising variables 9 | # See: http://ceph.com/docs/master/rados/operations/pg-states/ 10 | creating=0 11 | active=0 12 | clean=0 13 | down=0 14 | replay=0 15 | splitting=0 16 | scrubbing=0 17 | degraded=0 18 | inconsistent=0 19 | peering=0 20 | repair=0 21 | recovering=0 22 | backfill=0 23 | waitBackfill=0 24 | incomplete=0 25 | stale=0 26 | remapped=0 27 | 28 | # Get data 29 | pginfo=$(echo -n " pgmap $($ceph_bin pg stat)" | sed -n "s/.*pgmap/pgmap/p") 30 | pgtotal=$(echo $pginfo | cut -d':' -f2 | sed 's/[^0-9]//g') 31 | pgstats=$(echo $pginfo | cut -d':' -f3 | cut -d';' -f1| sed 's/ /\\ /g') 32 | pggdegraded=$(echo $pginfo | sed -n '/degraded/s/.* degraded (\([^%]*\)%.*/\1/p') 33 | if [[ "$pggdegraded" == "" ]] 34 | then 35 | pggdegraded=0 36 | fi 37 | # unfound (0.004%) 38 | pgunfound=$(echo $pginfo | cut -d';' -f2|sed -n '/unfound/s/.*unfound (\([^%]*\)%.*/\1/p') 39 | if [[ "$pgunfound" == "" ]] 40 | then 41 | pgunfound=0 42 | fi 43 | 44 | 45 | clientio=$($ceph_bin -s |grep "client io") 46 | # read kbps B/s 47 | rdbps=$(echo $clientio | sed -n '/client/s/.* \([0-9]* .\?\)B\/s rd.*/\1/p' | sed -e "s/K/*1000/ig;s/M/*1000*1000/i;s/G/*1000*1000*1000/i;s/E/*1000*1000*1000*1000/i" | bc) 48 | # write kbps B/s 49 | wrbps=$(echo $clientio | sed -n '/client/s/.* \([0-9]* .\?\)B\/s wr.*/\1/p' | sed -e "s/K/*1000/ig;s/M/*1000*1000/i;s/G/*1000*1000*1000/i;s/E/*1000*1000*1000*1000/i" | bc) 50 | # read kbps B/s 51 | #rdbps=$(echo $pginfo | sed -n '/pgmap/s/.* \([0-9]* .\?\)B\/s rd.*/\1/p' | sed -e "s/K/*1000/ig;s/M/*1000*1000/i;s/G/*1000*1000*1000/i;s/E/*1000*1000*1000*1000/i" | bc) 52 | if [[ "$rdbps" == "" ]] 53 | then 54 | rdbps=0 55 | fi 56 | 57 | # write kbps B/s 58 | #wrbps=$(echo $pginfo | sed -n '/pgmap/s/.* \([0-9]* .\?\)B\/s wr.*/\1/p' | sed -e
"s/K/*1000/ig;s/M/*1000*1000/i;s/G/*1000*1000*1000/i;s/E/*1000*1000*1000*1000/i" | bc) 59 | if [[ "$wrbps" == "" ]] 60 | then 61 | wrbps=0 62 | fi 63 | 64 | # ops 65 | rops=$(echo $clientio | sed -n '/client/s/.* \([0-9]* k\?\)op\/s rd.*/\1/p'|sed -e "s/K/*1000/ig"|bc) 66 | if [[ "$rops" == "" ]] 67 | then 68 | rops=0 69 | fi 70 | 71 | wops=$(echo $clientio | sed -n '/client/s/.* \([0-9]* k\?\)op\/s wr.*/\1/p'|sed -e "s/K/*1000/ig"|bc) 72 | if [[ "$wops" == "" ]] 73 | then 74 | wops=0 75 | fi 76 | 77 | ops=$(echo $rops + $wops | bc) 78 | 79 | #ops=$(echo $pginfo | sed -n '/pgmap/s/.* \([0-9]*\) op\/s.*/\1/p') 80 | #if [[ "$ops" == "" ]] 81 | #then 82 | # ops=0 83 | #fi 84 | 85 | 86 | # Explode array 87 | IFS=', ' read -a array <<< "$pgstats" 88 | for element in "${array[@]}" 89 | do 90 | element=$(echo "$element" | sed 's/^ *//g') 91 | # Get elements 92 | number=$(echo $element | cut -d' ' -f1) 93 | data=$(echo $element | cut -d' ' -f2) 94 | 95 | # Agregate data 96 | if [ "$(echo $data | grep creating | wc -l)" == 1 ] 97 | then 98 | creating=$(echo $creating+$number|bc) 99 | fi 100 | 101 | if [ "$(echo $data | grep active | wc -l)" == 1 ] 102 | then 103 | active=$(echo $active+$number|bc) 104 | fi 105 | 106 | if [ "$(echo $data | grep clean | wc -l)" == 1 ] 107 | then 108 | clean=$(echo $clean+$number|bc) 109 | fi 110 | 111 | if [ "$(echo $data | grep down | wc -l)" == 1 ] 112 | then 113 | down=$(echo $down+$number|bc) 114 | fi 115 | 116 | if [ "$(echo $data | grep replay | wc -l)" == 1 ] 117 | then 118 | replay=$(echo $replay+$number|bc) 119 | fi 120 | 121 | if [ "$(echo $data | grep splitting | wc -l)" == 1 ] 122 | then 123 | splitting=$(echo $splitting+$number|bc) 124 | fi 125 | 126 | if [ "$(echo $data | grep scrubbing | wc -l)" == 1 ] 127 | then 128 | scrubbing=$(echo $scrubbing+$number|bc) 129 | fi 130 | 131 | if [ "$(echo $data | grep degraded | wc -l)" == 1 ] 132 | then 133 | degraded=$(echo $degraded+$number|bc) 134 | fi 135 | 136 | if [ "$(echo $data | grep inconsistent | wc -l)" == 1 ] 137 | then 138 | inconsistent=$(echo $inconsistent+$number|bc) 139 | fi 140 | 141 | if [ "$(echo $data | grep peering | wc -l)" == 1 ] 142 | then 143 | peering=$(echo $peering+$number|bc) 144 | fi 145 | 146 | if [ "$(echo $data | grep repair | wc -l)" == 1 ] 147 | then 148 | repair=$(echo $repair+$number|bc) 149 | fi 150 | 151 | if [ "$(echo $data | grep recovering | wc -l)" == 1 ] 152 | then 153 | recovering=$(echo $recovering+$number|bc) 154 | fi 155 | 156 | if [ "$(echo $data | grep backfill | wc -l)" == 1 ] 157 | then 158 | backfill=$(echo $backfill+$number|bc) 159 | fi 160 | 161 | if [ "$(echo $data | grep "wait-backfill" | wc -l)" == 1 ] 162 | then 163 | waitBackfill=$(echo $waitBackfill+$number|bc) 164 | fi 165 | 166 | if [ "$(echo $data | grep incomplete | wc -l)" == 1 ] 167 | then 168 | incomplete=$(echo $incomplete+$number|bc) 169 | fi 170 | 171 | if [ "$(echo $data | grep stale | wc -l)" == 1 ] 172 | then 173 | stale=$(echo $stale+$number|bc) 174 | fi 175 | 176 | if [ "$(echo $data | grep remapped | wc -l)" == 1 ] 177 | then 178 | remapped=$(echo $remapped+$number|bc) 179 | fi 180 | done 181 | 182 | ceph_osd_count=$($ceph_bin osd dump |grep "^osd"| wc -l) 183 | 184 | function ceph_osd_up_percent() 185 | { 186 | OSD_DOWN=$($ceph_bin osd dump |grep "^osd"| awk '{print $1 " " $2 " " $3}'|grep up|wc -l) 187 | COUNT=$(echo "scale=2; $OSD_DOWN*100/$ceph_osd_count" |bc) 188 | if [[ "$COUNT" != "" ]] 189 | then 190 | echo $COUNT 191 | else 192 | echo "0" 193 | fi 194 | } 195 | 196 | function 
ceph_osd_in_percent() 197 | { 198 | OSD_DOWN=$($ceph_bin osd dump |grep "^osd"| awk '{print $1 " " $2 " " $3}'|grep in|wc -l) 199 | COUNT=$(echo "scale=2; $OSD_DOWN*100/$ceph_osd_count" | bc) 200 | if [[ "$COUNT" != "" ]] 201 | then 202 | echo $COUNT 203 | else 204 | echo "0" 205 | fi 206 | } 207 | 208 | function ceph_mon_get_active() 209 | { 210 | ACTIVE=$($ceph_bin status|sed -n '/monmap/s/.* \([0-9]*\) mons.*/\1/p') 211 | if [[ "$ACTIVE" != "" ]] 212 | then 213 | echo $ACTIVE 214 | else 215 | echo 0 216 | fi 217 | } 218 | 219 | function ceph_get() 220 | { 221 | # Return the value 222 | case $1 in 223 | health_detail) 224 | $ceph_bin health 225 | ;; 226 | health) 227 | status=$($ceph_bin health | awk '{print $1}') 228 | case $status in 229 | HEALTH_OK) 230 | echo 1 231 | ;; 232 | HEALTH_WARN) 233 | echo 2 234 | ;; 235 | HEALTH_ERR) 236 | echo 3 237 | ;; 238 | *) 239 | echo -1 240 | ;; 241 | esac 242 | ;; 243 | rados_total) 244 | $rados_bin df | grep "total space"| awk '{print $3}' 245 | ;; 246 | rados_used) 247 | $rados_bin df | grep "total used"| awk '{print $3}' 248 | ;; 249 | rados_free) 250 | $rados_bin df | grep "total avail"| awk '{print $3}' 251 | ;; 252 | mon) 253 | ceph_mon_get_active 254 | ;; 255 | count) 256 | echo $ceph_osd_count 257 | ;; 258 | up) 259 | ceph_osd_up_percent 260 | ;; 261 | "in") 262 | ceph_osd_in_percent 263 | ;; 264 | degraded_percent) 265 | echo $pggdegraded 266 | ;; 267 | pgtotal) 268 | echo $pgtotal 269 | ;; 270 | creating) 271 | echo $creating 272 | ;; 273 | active) 274 | echo $active 275 | ;; 276 | clean) 277 | echo $clean 278 | ;; 279 | down) 280 | echo $down 281 | ;; 282 | replay) 283 | echo $replay 284 | ;; 285 | splitting) 286 | echo $splitting 287 | ;; 288 | scrubbing) 289 | echo $scrubbing 290 | ;; 291 | degraded) 292 | echo $degraded 293 | ;; 294 | inconsistent) 295 | echo $inconsistent 296 | ;; 297 | peering) 298 | echo $peering 299 | ;; 300 | repair) 301 | echo $repair 302 | ;; 303 | recovering) 304 | echo $recovering 305 | ;; 306 | backfill) 307 | echo $backfill 308 | ;; 309 | waitBackfill) 310 | echo $waitBackfill 311 | ;; 312 | incomplete) 313 | echo $incomplete 314 | ;; 315 | stale) 316 | echo $stale 317 | ;; 318 | remapped) 319 | echo $remapped 320 | ;; 321 | rops) 322 | echo $rops 323 | ;; 324 | wops) 325 | echo $wops 326 | ;; 327 | ops) 328 | echo $ops 329 | ;; 330 | wrbps) 331 | echo $wrbps 332 | ;; 333 | rdbps) 334 | echo $rdbps 335 | ;; 336 | esac 337 | } 338 | 339 | function get_kv() 340 | { 341 | echo - ceph.health_detail \"$(ceph_get health_detail)\" \\n 342 | echo - ceph.health $(ceph_get health) \\n 343 | echo - ceph.count $(ceph_get count) \\n 344 | echo - ceph.osd_in $(ceph_get in) \\n 345 | echo - ceph.osd_up $(ceph_get up) \\n 346 | echo - ceph.active $(ceph_get active) \\n 347 | echo - ceph.backfill $(ceph_get backfill) \\n 348 | echo - ceph.clean $(ceph_get clean) \\n 349 | echo - ceph.creating $(ceph_get creating) \\n 350 | echo - ceph.degraded $(ceph_get degraded) \\n 351 | echo - ceph.degraded_percent $(ceph_get degraded_percent) \\n 352 | echo - ceph.down $(ceph_get down) \\n 353 | echo - ceph.incomplete $(ceph_get incomplete) \\n 354 | echo - ceph.inconsistent $(ceph_get inconsistent) \\n 355 | echo - ceph.peering $(ceph_get peering) \\n 356 | echo - ceph.recovering $(ceph_get recovering) \\n 357 | echo - ceph.remapped $(ceph_get remapped) \\n 358 | echo - ceph.repair $(ceph_get repair) \\n 359 | echo - ceph.replay $(ceph_get replay) \\n 360 | echo - ceph.scrubbing $(ceph_get scrubbing) \\n 361 | echo - ceph.splitting 
$(ceph_get splitting) \\n 362 | echo - ceph.stale $(ceph_get stale) \\n 363 | echo - ceph.pgtotal $(ceph_get pgtotal) \\n 364 | echo - ceph.waitBackfill $(ceph_get waitBackfill) \\n 365 | echo - ceph.mon $(ceph_get mon) \\n 366 | echo - ceph.rados_total $(ceph_get rados_total) \\n 367 | echo - ceph.rados_used $(ceph_get rados_used) \\n 368 | echo - ceph.rados_free $(ceph_get rados_free) \\n 369 | echo - ceph.wrbps $(ceph_get wrbps) \\n 370 | echo - ceph.rdbps $(ceph_get rdbps) \\n 371 | echo - ceph.ops $(ceph_get ops) \\n 372 | echo - ceph.rops $(ceph_get rops) \\n 373 | echo - ceph.wops $(ceph_get wops) 374 | 375 | } 376 | sleep $(echo $RANDOM%50|bc) 377 | echo -e $(get_kv) >/tmp/zabbix_kv.txt 378 | $zabbix_sender_bin -vv --zabbix-server $1 --host $2 --input-file /tmp/zabbix_kv.txt >/dev/null 2>&1 379 | echo $? 380 | 381 | -------------------------------------------------------------------------------- /ceph_cron.txt: -------------------------------------------------------------------------------- 1 | #ceph-status.sh serverActiveIP, ZabbixHost 2 | * * * * * /etc/zabbix/scripts/ceph-status.sh 10.25.195.3 10.25.195.8 3 | -------------------------------------------------------------------------------- /zabbix_templates/zbx_ceph_cluster_template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3.0 4 | 2016-11-02T01:48:52Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 1684 | 1685 | 1686 | 1687 | {Templace_Ceph_Cluster_Active:ceph.degraded.last(0)}>0 1688 | Ceph cluster has degraded PGs 1689 | 1690 | 0 1691 | 2 1692 | Ceph has not replicated some objects in the placement group the correct number of times yet. 1693 | 0 1694 | 1695 | 1696 | 1697 | {Templace_Ceph_Cluster_Active:ceph.down.last(0)}>0 1698 | Ceph cluster has down PGs 1699 | 1700 | 0 1701 | 3 1702 | At least a replica with necessary data is down, so the placement group is offline. 
1703 | 0 1704 | 1705 | 1706 | 1707 | {Templace_Ceph_Cluster_Active:ceph.ops.last(0)}>20000 1708 | Ceph cluster ops is up to 20000 1709 | 1710 | 0 1711 | 3 1712 | 1713 | 0 1714 | 1715 | 1716 | 1717 | {Templace_Ceph_Cluster_Active:ceph.rados_used_ratio.last()}>50 1718 | Ceph cluster space used ratio up to 50% 1719 | 1720 | 0 1721 | 4 1722 | Ceph集群需要及时扩容啦 1723 | 0 1724 | 1725 | 1726 | 1727 | {Templace_Ceph_Cluster_Active:ceph.health.nodata(10m)}=1 1728 | ceph monitor data is lost more than 10 minutes 1729 | 1730 | 0 1731 | 3 1732 | 1733 | 0 1734 | 1735 | 1736 | 1737 | {Templace_Ceph_Cluster_Active:ceph.health_detail.str("requests are blocked")}=1 1738 | ceph requests are blocked 1739 | 1740 | 0 1741 | 5 1742 | 1743 | 0 1744 | 1745 | 1746 | 1747 | {Templace_Ceph_Cluster_Active:ceph.health.min(3m)}=3 1748 | ceph service is down 1749 | 1750 | 0 1751 | 5 1752 | 以 HEALTH_WARN 中的 requests are blocked和 HEALTH_ERR 算做服务不可用,HEALTH_WARN 中其他情况作为服务不可用的,后续根据运维经验再补充。 1753 | 0 1754 | 1755 | 1756 | 1757 | 1758 | 1759 | Ceph cluster storage 1760 | 500 1761 | 200 1762 | 0.0000 1763 | 100.0000 1764 | 0 1765 | 0 1766 | 0 1767 | 1 1768 | 0 1769 | 0.0000 1770 | 0.0000 1771 | 1 1772 | 0 1773 | 0 1774 | 0 1775 | 1776 | 1777 | 0 1778 | 1 1779 | 00EE00 1780 | 0 1781 | 1 1782 | 0 1783 | 1784 | Templace_Ceph_Cluster_Active 1785 | ceph.rados_total 1786 | 1787 | 1788 | 1789 | 1 1790 | 1 1791 | EE0000 1792 | 0 1793 | 4 1794 | 0 1795 | 1796 | Templace_Ceph_Cluster_Active 1797 | ceph.rados_used 1798 | 1799 | 1800 | 1801 | 1802 | 1803 | Ceph Load 1804 | 900 1805 | 200 1806 | 0.0000 1807 | 100.0000 1808 | 1 1809 | 1 1810 | 0 1811 | 1 1812 | 0 1813 | 0.0000 1814 | 0.0000 1815 | 1 1816 | 0 1817 | 0 1818 | 0 1819 | 1820 | 1821 | 0 1822 | 0 1823 | 00C800 1824 | 0 1825 | 2 1826 | 0 1827 | 1828 | Templace_Ceph_Cluster_Active 1829 | ceph.wrbps 1830 | 1831 | 1832 | 1833 | 1 1834 | 0 1835 | 0000C8 1836 | 0 1837 | 2 1838 | 0 1839 | 1840 | Templace_Ceph_Cluster_Active 1841 | ceph.rdbps 1842 | 1843 | 1844 | 1845 | 2 1846 | 0 1847 | C80000 1848 | 1 1849 | 2 1850 | 0 1851 | 1852 | Templace_Ceph_Cluster_Active 1853 | ceph.ops 1854 | 1855 | 1856 | 1857 | 3 1858 | 0 1859 | A54F10 1860 | 0 1861 | 2 1862 | 0 1863 | 1864 | Templace_Ceph_Cluster_Active 1865 | ceph.rops 1866 | 1867 | 1868 | 1869 | 4 1870 | 0 1871 | FC6EA3 1872 | 0 1873 | 2 1874 | 0 1875 | 1876 | Templace_Ceph_Cluster_Active 1877 | ceph.wops 1878 | 1879 | 1880 | 1881 | 1882 | 1883 | Ceph space repartition 1884 | 500 1885 | 200 1886 | 0.0000 1887 | 0.0000 1888 | 0 1889 | 0 1890 | 2 1891 | 1 1892 | 0 1893 | 0.0000 1894 | 0.0000 1895 | 0 1896 | 0 1897 | 0 1898 | 0 1899 | 1900 | 1901 | 0 1902 | 0 1903 | 00EE00 1904 | 0 1905 | 2 1906 | 0 1907 | 1908 | Templace_Ceph_Cluster_Active 1909 | ceph.rados_free 1910 | 1911 | 1912 | 1913 | 1 1914 | 0 1915 | EE0000 1916 | 0 1917 | 2 1918 | 0 1919 | 1920 | Templace_Ceph_Cluster_Active 1921 | ceph.rados_used 1922 | 1923 | 1924 | 1925 | 1926 | 1927 | Degraded % 1928 | 900 1929 | 200 1930 | 0.0000 1931 | 100.0000 1932 | 1 1933 | 1 1934 | 0 1935 | 1 1936 | 0 1937 | 0.0000 1938 | 0.0000 1939 | 1 1940 | 0 1941 | 0 1942 | 0 1943 | 1944 | 1945 | 0 1946 | 5 1947 | CC0000 1948 | 0 1949 | 2 1950 | 0 1951 | 1952 | Templace_Ceph_Cluster_Active 1953 | ceph.degraded_percent 1954 | 1955 | 1956 | 1957 | 1958 | 1959 | Moving PGs 1960 | 900 1961 | 200 1962 | 0.0000 1963 | 100.0000 1964 | 1 1965 | 1 1966 | 0 1967 | 1 1968 | 0 1969 | 0.0000 1970 | 0.0000 1971 | 1 1972 | 0 1973 | 0 1974 | 0 1975 | 1976 | 1977 | 0 1978 | 0 1979 | C80000 1980 | 0 1981 | 2 1982 | 0 1983 | 
1984 | Templace_Ceph_Cluster_Active 1985 | ceph.recovering 1986 | 1987 | 1988 | 1989 | 1 1990 | 0 1991 | 00C800 1992 | 0 1993 | 2 1994 | 0 1995 | 1996 | Templace_Ceph_Cluster_Active 1997 | ceph.remapped 1998 | 1999 | 2000 | 2001 | 2 2002 | 0 2003 | 0000C8 2004 | 0 2005 | 2 2006 | 0 2007 | 2008 | Templace_Ceph_Cluster_Active 2009 | ceph.peering 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | OSDs 2016 | 900 2017 | 200 2018 | 0.0000 2019 | 100.0000 2020 | 1 2021 | 1 2022 | 0 2023 | 1 2024 | 0 2025 | 0.0000 2026 | 0.0000 2027 | 1 2028 | 1 2029 | 0 2030 | 0 2031 | 2032 | 2033 | 0 2034 | 5 2035 | 00EE00 2036 | 0 2037 | 2 2038 | 0 2039 | 2040 | Templace_Ceph_Cluster_Active 2041 | ceph.osd_up 2042 | 2043 | 2044 | 2045 | 1 2046 | 2 2047 | CC0000 2048 | 0 2049 | 2 2050 | 0 2051 | 2052 | Templace_Ceph_Cluster_Active 2053 | ceph.osd_in 2054 | 2055 | 2056 | 2057 | 2058 | 2059 | PGS 2060 | 900 2061 | 200 2062 | 0.0000 2063 | 100.0000 2064 | 1 2065 | 1 2066 | 0 2067 | 1 2068 | 0 2069 | 0.0000 2070 | 0.0000 2071 | 1 2072 | 0 2073 | 0 2074 | 0 2075 | 2076 | 2077 | 0 2078 | 5 2079 | 0000EE 2080 | 0 2081 | 2 2082 | 0 2083 | 2084 | Templace_Ceph_Cluster_Active 2085 | ceph.active 2086 | 2087 | 2088 | 2089 | 1 2090 | 2 2091 | 00EE00 2092 | 0 2093 | 2 2094 | 0 2095 | 2096 | Templace_Ceph_Cluster_Active 2097 | ceph.clean 2098 | 2099 | 2100 | 2101 | 2102 | 2103 | Problem PGs 2104 | 900 2105 | 200 2106 | 0.0000 2107 | 100.0000 2108 | 1 2109 | 1 2110 | 0 2111 | 1 2112 | 0 2113 | 0.0000 2114 | 0.0000 2115 | 1 2116 | 0 2117 | 0 2118 | 0 2119 | 2120 | 2121 | 0 2122 | 0 2123 | 00EE00 2124 | 0 2125 | 2 2126 | 0 2127 | 2128 | Templace_Ceph_Cluster_Active 2129 | ceph.degraded 2130 | 2131 | 2132 | 2133 | 1 2134 | 0 2135 | 0000C8 2136 | 0 2137 | 2 2138 | 0 2139 | 2140 | Templace_Ceph_Cluster_Active 2141 | ceph.incomplete 2142 | 2143 | 2144 | 2145 | 2 2146 | 0 2147 | C800C8 2148 | 0 2149 | 2 2150 | 0 2151 | 2152 | Templace_Ceph_Cluster_Active 2153 | ceph.inconsistent 2154 | 2155 | 2156 | 2157 | 3 2158 | 0 2159 | EE0000 2160 | 0 2161 | 2 2162 | 0 2163 | 2164 | Templace_Ceph_Cluster_Active 2165 | ceph.down 2166 | 2167 | 2168 | 2169 | 2170 | 2171 | 2172 | -------------------------------------------------------------------------------- /zabbix_templates/zbx_ceph_cluster_template_v1_format.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 0 6 | 1 7 | 8 | 127.0.0.1 9 | 10050 10 | 3 11 | 0 12 | 127.0.0.1 13 | 623 14 | 0 15 | 2 16 | 17 | 18 | 19 | Templates 20 | 21 | 22 | 23 | Ceph cluster has degraded PGs 24 | 0 25 | {Templace_Ceph_Cluster:ceph.degraded.last(0)}>0 26 | 27 | 0 28 | 4 29 | Ceph has not replicated some objects in the placement group the correct number of times yet. 30 | 31 | 32 | Ceph cluster has down PGs 33 | 0 34 | {Templace_Ceph_Cluster:ceph.down.last(0)}>0 35 | 36 | 0 37 | 3 38 | At least a replica with necessary data is down, so the placement group is offline. 
39 | 40 | 41 | 42 | 43 | Ceph Cluster health 44 | 0 45 | 46 | 47 | 30 48 | 90 49 | 365 50 | 0 51 | 52 | 0 53 | 54 | 0 55 | 56 | 57 | 1 58 | 59 | 60 | 61 | 0 62 | 0 63 | 64 | 65 | 66 | 67 | 68 | 69 | Ceph Cluster 70 | 71 | 72 | 161 73 | 74 | 75 | Ceph active MON 76 | 77 | 0 78 | 79 | 30 80 | 90 81 | 365 82 | 0 83 | 0 84 | 85 | 0 86 | 87 | 0 88 | 89 | 90 | 1 91 | 92 | 93 | 94 | 0 95 | 0 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | Ceph Cluster 104 | 105 | 161 106 | 107 | 108 | Ceph Operation 109 | 110 | 0 111 | 112 | 30 113 | 90 114 | 365 115 | 0 116 | 0 117 | op/s 118 | 0 119 | 120 | 0 121 | 122 | 123 | 1 124 | 125 | 126 | 127 | 0 128 | 0 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | Ceph Cluster 137 | 138 | 161 139 | 140 | 141 | Ceph OSD count 142 | 143 | 0 144 | 145 | 30 146 | 90 147 | 365 148 | 0 149 | 150 | 0 151 | 152 | 0 153 | 154 | 155 | 1 156 | 157 | 158 | 159 | 0 160 | 0 161 | 162 | 163 | 164 | 165 | 166 | 167 | Ceph Cluster 168 | 169 | 161 170 | 171 | 172 | Ceph OSD in % 173 | 174 | 0 175 | 176 | 30 177 | 90 178 | 365 179 | 0 180 | 0 181 | % 182 | 0 183 | 184 | 0 185 | 186 | 187 | 1 188 | 189 | 190 | 191 | 0 192 | 0 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | Ceph Cluster 201 | 202 | 161 203 | 204 | 205 | Ceph OSD up % 206 | 207 | 0 208 | 209 | 30 210 | 90 211 | 365 212 | 0 213 | 0 214 | % 215 | 0 216 | 217 | 0 218 | 219 | 220 | 1 221 | 222 | 223 | 224 | 0 225 | 0 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | Ceph Cluster 234 | 235 | 161 236 | 237 | 238 | Ceph PG active 239 | 240 | 0 241 | 242 | 30 243 | 90 244 | 365 245 | 0 246 | 0 247 | 248 | 0 249 | 250 | 0 251 | 252 | 253 | 1 254 | 255 | 256 | 257 | 0 258 | 0 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | Ceph Cluster 267 | 268 | 161 269 | 270 | 271 | Ceph PG backfill 272 | 273 | 0 274 | 275 | 30 276 | 90 277 | 365 278 | 0 279 | 0 280 | 281 | 0 282 | 283 | 0 284 | 285 | 286 | 1 287 | 288 | 289 | 290 | 0 291 | 0 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | Ceph Cluster 300 | 301 | 161 302 | 303 | 304 | Ceph PG clean 305 | 306 | 0 307 | 308 | 30 309 | 90 310 | 365 311 | 0 312 | 0 313 | 314 | 0 315 | 316 | 0 317 | 318 | 319 | 1 320 | 321 | 322 | 323 | 0 324 | 0 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | Ceph Cluster 333 | 334 | 161 335 | 336 | 337 | Ceph PG creating 338 | 339 | 0 340 | 341 | 30 342 | 90 343 | 365 344 | 0 345 | 0 346 | 347 | 0 348 | 349 | 0 350 | 351 | 352 | 1 353 | 354 | 355 | 356 | 0 357 | 0 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | Ceph Cluster 366 | 367 | 161 368 | 369 | 370 | Ceph PG degraded 371 | 372 | 0 373 | 374 | 30 375 | 90 376 | 365 377 | 0 378 | 0 379 | 380 | 0 381 | 382 | 0 383 | 384 | 385 | 1 386 | 387 | 388 | 389 | 0 390 | 0 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | Ceph Cluster 399 | 400 | 161 401 | 402 | 403 | Ceph PG degraded % 404 | 405 | 0 406 | 407 | 30 408 | 90 409 | 365 410 | 0 411 | 0 412 | % 413 | 0 414 | 415 | 0 416 | 417 | 418 | 1 419 | 420 | 421 | 422 | 0 423 | 0 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | Ceph Cluster 432 | 433 | 161 434 | 435 | 436 | Ceph PG down 437 | 438 | 0 439 | 440 | 30 441 | 90 442 | 365 443 | 0 444 | 0 445 | 446 | 0 447 | 448 | 0 449 | 450 | 451 | 1 452 | 453 | 454 | 455 | 0 456 | 0 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | Ceph Cluster 465 | 466 | 161 467 | 468 | 469 | Ceph PG incomplete 470 | 471 | 0 472 | 473 | 30 474 | 90 475 | 365 476 | 0 477 | 0 478 | 479 | 0 480 | 481 | 0 482 | 483 | 484 | 1 485 | 486 | 487 | 488 | 0 489 | 0 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | Ceph Cluster 498 | 499 | 161 500 | 501 | 502 
| Ceph PG inconsistent 503 | 504 | 0 505 | 506 | 30 507 | 90 508 | 365 509 | 0 510 | 0 511 | 512 | 0 513 | 514 | 0 515 | 516 | 517 | 1 518 | 519 | 520 | 521 | 0 522 | 0 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | Ceph Cluster 531 | 532 | 161 533 | 534 | 535 | Ceph PG peering 536 | 537 | 0 538 | 539 | 30 540 | 90 541 | 365 542 | 0 543 | 0 544 | 545 | 0 546 | 547 | 0 548 | 549 | 550 | 1 551 | 552 | 553 | 554 | 0 555 | 0 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | Ceph Cluster 564 | 565 | 161 566 | 567 | 568 | Ceph PG recovering 569 | 570 | 0 571 | 572 | 30 573 | 90 574 | 365 575 | 0 576 | 0 577 | 578 | 0 579 | 580 | 0 581 | 582 | 583 | 1 584 | 585 | 586 | 587 | 0 588 | 0 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | Ceph Cluster 597 | 598 | 161 599 | 600 | 601 | Ceph PG remapped 602 | 603 | 0 604 | 605 | 30 606 | 90 607 | 365 608 | 0 609 | 0 610 | 611 | 0 612 | 613 | 0 614 | 615 | 616 | 1 617 | 618 | 619 | 620 | 0 621 | 0 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | Ceph Cluster 630 | 631 | 161 632 | 633 | 634 | Ceph PG repair 635 | 636 | 0 637 | 638 | 30 639 | 90 640 | 365 641 | 0 642 | 0 643 | 644 | 0 645 | 646 | 0 647 | 648 | 649 | 1 650 | 651 | 652 | 653 | 0 654 | 0 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | Ceph Cluster 663 | 664 | 161 665 | 666 | 667 | Ceph PG replay 668 | 669 | 0 670 | 671 | 30 672 | 90 673 | 365 674 | 0 675 | 0 676 | 677 | 0 678 | 679 | 0 680 | 681 | 682 | 1 683 | 684 | 685 | 686 | 0 687 | 0 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | Ceph Cluster 696 | 697 | 161 698 | 699 | 700 | Ceph PG scrubbing 701 | 702 | 0 703 | 704 | 30 705 | 90 706 | 365 707 | 0 708 | 0 709 | 710 | 0 711 | 712 | 0 713 | 714 | 715 | 1 716 | 717 | 718 | 719 | 0 720 | 0 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | Ceph Cluster 729 | 730 | 161 731 | 732 | 733 | Ceph PG splitting 734 | 735 | 0 736 | 737 | 30 738 | 90 739 | 365 740 | 0 741 | 0 742 | 743 | 0 744 | 745 | 0 746 | 747 | 748 | 1 749 | 750 | 751 | 752 | 0 753 | 0 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | Ceph Cluster 762 | 763 | 161 764 | 765 | 766 | Ceph PG stale 767 | 768 | 0 769 | 770 | 30 771 | 90 772 | 365 773 | 0 774 | 0 775 | 776 | 0 777 | 778 | 0 779 | 780 | 781 | 1 782 | 783 | 784 | 785 | 0 786 | 0 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | Ceph Cluster 795 | 796 | 161 797 | 798 | 799 | Ceph PG total 800 | 801 | 0 802 | 803 | 300 804 | 90 805 | 365 806 | 0 807 | 0 808 | 809 | 0 810 | 811 | 0 812 | 813 | 814 | 1 815 | 816 | 817 | 818 | 0 819 | 0 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | Ceph Cluster 828 | 829 | 161 830 | 831 | 832 | Ceph PG wait-backfill 833 | 834 | 0 835 | 836 | 30 837 | 90 838 | 365 839 | 0 840 | 0 841 | 842 | 0 843 | 844 | 0 845 | 846 | 847 | 1 848 | 849 | 850 | 851 | 0 852 | 0 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | Ceph Cluster 861 | 862 | 161 863 | 864 | 865 | Ceph rados free space 866 | 867 | 1 868 | 869 | 30 870 | 90 871 | 365 872 | 0 873 | 0 874 | B 875 | 0 876 | 877 | 0 878 | 879 | 880 | 1024 881 | 882 | 883 | 884 | 0 885 | 0 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | Ceph Cluster 894 | 895 | 161 896 | 897 | 898 | Ceph rados total space 899 | 900 | 1 901 | 902 | 300 903 | 90 904 | 365 905 | 0 906 | 0 907 | B 908 | 0 909 | 910 | 0 911 | 912 | 913 | 1024 914 | 915 | 916 | 917 | 0 918 | 0 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | Ceph Cluster 927 | 928 | 161 929 | 930 | 931 | Ceph rados used space 932 | 933 | 1 934 | 935 | 30 936 | 90 937 | 365 938 | 0 939 | 0 940 | B 941 | 0 942 | 943 | 0 944 | 945 | 946 | 1024 947 | 948 | 949 | 950 | 0 951 | 0 
952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | Ceph Cluster 960 | 961 | 161 962 | 963 | 964 | Ceph Write Speed 965 | 966 | 0 967 | 968 | 30 969 | 90 970 | 365 971 | 0 972 | 0 973 | B/s 974 | 0 975 | 976 | 0 977 | 978 | 979 | 1 980 | 981 | 982 | 983 | 0 984 | 0 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | Ceph Cluster 993 | 994 | 161 995 | 996 | 997 | Ceph Read Speed 998 | 999 | 0 1000 | 1001 | 30 1002 | 90 1003 | 365 1004 | 0 1005 | 0 1006 | B/s 1007 | 0 1008 | 1009 | 0 1010 | 1011 | 1012 | 1 1013 | 1014 | 1015 | 1016 | 0 1017 | 0 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | Ceph Cluster 1026 | 1027 | 161 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 0.0000 1034 | 100.0000 1035 | 0 1036 | 0 1037 | 0 1038 | 1 1039 | 0 1040 | 0.0000 1041 | 0.0000 1042 | 1 1043 | 0 1044 | 1045 | 1046 | 1047 | 1048 | 0 1049 | 1 1050 | 00EE00 1051 | 0 1052 | 1 1053 | 0 1054 | 5 1055 | 1056 | 1057 | 1 1058 | 1 1059 | EE0000 1060 | 0 1061 | 4 1062 | 0 1063 | 5 1064 | 1065 | 1066 | 1067 | 1068 | 0.0000 1069 | 100.0000 1070 | 1 1071 | 1 1072 | 0 1073 | 1 1074 | 0 1075 | 0.0000 1076 | 0.0000 1077 | 1 1078 | 0 1079 | 1080 | 1081 | 1082 | 1083 | 1 1084 | 1 1085 | C80000 1086 | 1 1087 | 2 1088 | 0 1089 | 5 1090 | 1091 | 1092 | 0 1093 | 0 1094 | 00C800 1095 | 0 1096 | 2 1097 | 0 1098 | 5 1099 | 1100 | 1101 | 0 1102 | 0 1103 | 0000C8 1104 | 0 1105 | 2 1106 | 0 1107 | 5 1108 | 1109 | 1110 | 1111 | 1112 | 0.0000 1113 | 0.0000 1114 | 0 1115 | 0 1116 | 2 1117 | 1 1118 | 0 1119 | 0.0000 1120 | 0.0000 1121 | 0 1122 | 0 1123 | 1124 | 1125 | 1126 | 1127 | 0 1128 | 0 1129 | 00EE00 1130 | 0 1131 | 2 1132 | 0 1133 | 5 1134 | 1135 | 1136 | 1 1137 | 0 1138 | EE0000 1139 | 0 1140 | 2 1141 | 0 1142 | 5 1143 | 1144 | 1145 | 1146 | 1147 | 0.0000 1148 | 100.0000 1149 | 1 1150 | 1 1151 | 0 1152 | 1 1153 | 0 1154 | 0.0000 1155 | 0.0000 1156 | 1 1157 | 0 1158 | 1159 | 1160 | 1161 | 1162 | 0 1163 | 5 1164 | CC0000 1165 | 0 1166 | 2 1167 | 0 1168 | 5 1169 | 1170 | 1171 | 1172 | 1173 | 0.0000 1174 | 100.0000 1175 | 1 1176 | 1 1177 | 0 1178 | 1 1179 | 0 1180 | 0.0000 1181 | 0.0000 1182 | 1 1183 | 0 1184 | 1185 | 1186 | 1187 | 1188 | 0 1189 | 0 1190 | C80000 1191 | 0 1192 | 2 1193 | 0 1194 | 5 1195 | 1196 | 1197 | 1 1198 | 0 1199 | 00C800 1200 | 0 1201 | 2 1202 | 0 1203 | 5 1204 | 1205 | 1206 | 2 1207 | 0 1208 | 0000C8 1209 | 0 1210 | 2 1211 | 0 1212 | 5 1213 | 1214 | 1215 | 1216 | 1217 | 0.0000 1218 | 100.0000 1219 | 1 1220 | 1 1221 | 0 1222 | 1 1223 | 0 1224 | 0.0000 1225 | 0.0000 1226 | 1 1227 | 1 1228 | 1229 | 1230 | 1231 | 1232 | 0 1233 | 5 1234 | 00EE00 1235 | 0 1236 | 2 1237 | 0 1238 | 5 1239 | 1240 | 1241 | 1 1242 | 2 1243 | CC0000 1244 | 0 1245 | 2 1246 | 0 1247 | 5 1248 | 1249 | 1250 | 1251 | 1252 | 0.0000 1253 | 100.0000 1254 | 1 1255 | 1 1256 | 0 1257 | 1 1258 | 0 1259 | 0.0000 1260 | 0.0000 1261 | 1 1262 | 0 1263 | 1264 | 1265 | 1266 | 1267 | 1 1268 | 2 1269 | 00EE00 1270 | 0 1271 | 2 1272 | 0 1273 | 5 1274 | 1275 | 1276 | 0 1277 | 5 1278 | 0000EE 1279 | 0 1280 | 2 1281 | 0 1282 | 5 1283 | 1284 | 1285 | 1286 | 1287 | 0.0000 1288 | 100.0000 1289 | 1 1290 | 1 1291 | 0 1292 | 1 1293 | 0 1294 | 0.0000 1295 | 0.0000 1296 | 1 1297 | 0 1298 | 1299 | 1300 | 1301 | 1302 | 0 1303 | 0 1304 | 00EE00 1305 | 0 1306 | 2 1307 | 0 1308 | 5 1309 | 1310 | 1311 | 3 1312 | 0 1313 | EE0000 1314 | 0 1315 | 2 1316 | 0 1317 | 5 1318 | 1319 | 1320 | 1 1321 | 0 1322 | 0000C8 1323 | 0 1324 | 2 1325 | 0 1326 | 5 1327 | 1328 | 1329 | 2 1330 | 0 1331 | C800C8 1332 | 0 1333 | 2 1334 | 0 1335 | 5 1336 | 1337 | 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 
1344 | 1345 | -------------------------------------------------------------------------------- /zabbix_templates/zbx_ceph_mds_template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 2013-02-25T14:35:08Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 70 | 71 | 72 | 73 | {Template_Ceph_mds:proc.num[ceph-mds].last(0)}=0 74 | Ceph MDS is not running on {HOSTNAME} 75 | 76 | 0 77 | 4 78 | 79 | 0 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /zabbix_templates/zbx_ceph_mon_template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 2013-02-25T14:34:53Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 70 | 71 | 72 | 73 | {Template_Ceph_mon:proc.num[ceph-mon].last(0)}=0 74 | Ceph MON is not running on {HOSTNAME} 75 | 76 | 0 77 | 4 78 | 79 | 0 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /zabbix_templates/zbx_ceph_osd_template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 2.0 4 | 2013-02-25T14:34:30Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 70 | 71 | 72 | 73 | {Template_Ceph_osd:proc.num[ceph-osd].last(0)}=0 74 | Ceph OSD is down on {HOSTNAME} 75 | 76 | 0 77 | 4 78 | 79 | 0 80 | 81 | 82 | 83 | 84 | --------------------------------------------------------------------------------
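
Quick check after installation (a minimal sketch; run it on a monitor host that has ceph admin access, and substitute your own serverActiveIP and ZabbixHost, since 10.25.195.3 and 10.25.195.8 below are just the example values from ceph_cron.txt):

# Run the collector once by hand. Note that it sleeps a random 0-50 seconds before sending.
sudo /etc/zabbix/scripts/ceph-status.sh 10.25.195.3 10.25.195.8
# The script prints the zabbix_sender exit code; 0 means the Zabbix server accepted the values.
# The key/value pairs that were sent are left in /tmp/zabbix_kv.txt for inspection:
cat /tmp/zabbix_kv.txt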