├── debian ├── compat ├── source │ └── format ├── ceph-medic.lintian-overrides ├── rules ├── changelog ├── control └── copyright ├── ceph_medic ├── checks │ ├── clients.py │ ├── mdss.py │ ├── mgrs.py │ ├── rgws.py │ ├── __init__.py │ ├── cluster.py │ ├── osds.py │ ├── mons.py │ └── common.py ├── rules │ ├── __init__.py │ ├── jewel.py │ └── kraken.py ├── tests │ ├── util │ │ ├── __init__.py │ │ ├── test_configuration.py │ │ └── test_hosts.py │ ├── checks │ │ ├── __init__.py │ │ ├── test_cluster.py │ │ ├── test_osds.py │ │ └── test_mons.py │ ├── remote │ │ ├── __init__.py │ │ ├── test_commands.py │ │ └── test_functions.py │ ├── __init__.py │ ├── test_terminal.py │ ├── test_main.py │ ├── test_log.py │ ├── test_collector.py │ ├── conftest.py │ └── test_runner.py ├── remote │ ├── __init__.py │ ├── util.py │ ├── commands.py │ └── functions.py ├── util │ ├── net.py │ ├── __init__.py │ ├── mon.py │ └── hosts.py ├── compat.py ├── log.py ├── __init__.py ├── loader.py ├── decorators.py ├── check.py ├── generate.py ├── connection.py ├── terminal.py ├── main.py ├── runner.py └── collector.py ├── docs ├── .gitignore ├── source │ ├── _static │ │ └── .empty │ ├── _themes │ │ └── ceph │ │ │ ├── theme.conf │ │ │ └── static │ │ │ ├── font │ │ │ ├── ApexSans-Book.eot │ │ │ ├── ApexSans-Book.ttf │ │ │ ├── ApexSans-Book.woff │ │ │ ├── ApexSans-Medium.eot │ │ │ ├── ApexSans-Medium.ttf │ │ │ └── ApexSans-Medium.woff │ │ │ └── nature.css_t │ ├── contents.rst │ ├── codes │ │ ├── cluster.rst │ │ ├── mons.rst │ │ ├── osds.rst │ │ └── common.rst │ ├── codes.rst │ ├── _templates │ │ └── smarttoc.html │ ├── changelog.rst │ ├── installation.rst │ ├── facts.rst │ ├── conf.py │ └── index.rst └── Makefile ├── requirements-dev.txt ├── tests └── functional │ ├── centos7 │ ├── Vagrantfile │ ├── hosts │ ├── test.yml │ ├── group_vars │ │ └── all │ └── vagrant_variables.yml │ ├── .gitignore │ ├── scripts │ └── generate_ssh_config.sh │ ├── tox.ini │ ├── playbooks │ └── setup.yml │ └── Vagrantfile ├── 
setup.cfg ├── MANIFEST.in ├── bin └── ceph-medic ├── .gitignore ├── README.rst ├── tox.ini ├── LICENSE ├── ceph-medic.spec.in ├── CONTRIBUTING.rst ├── Makefile └── setup.py /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /ceph_medic/checks/clients.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/checks/mdss.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/checks/mgrs.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/checks/rgws.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /docs/source/_static/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/tests/checks/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceph_medic/tests/remote/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (quilt) 2 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest >=2.1.3 2 | tox >=1.2 3 | -------------------------------------------------------------------------------- /tests/functional/centos7/Vagrantfile: -------------------------------------------------------------------------------- 1 | ../Vagrantfile -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | norecursedirs = .* _* virtualenv 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | include LICENSE 3 | include tox.ini 4 | -------------------------------------------------------------------------------- /tests/functional/.gitignore: -------------------------------------------------------------------------------- 1 | *.vdi 2 | .vagrant/ 3 | vagrant_ssh_config 4 | -------------------------------------------------------------------------------- /ceph_medic/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from . import functions # noqa 2 | from . 
import commands # noqa 3 | -------------------------------------------------------------------------------- /ceph_medic/checks/__init__.py: -------------------------------------------------------------------------------- 1 | from . import osds, mons, clients, rgws, mdss, common, mgrs, cluster # noqa 2 | -------------------------------------------------------------------------------- /docs/source/_themes/ceph/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = nature.css 4 | pygments_style = tango 5 | -------------------------------------------------------------------------------- /debian/ceph-medic.lintian-overrides: -------------------------------------------------------------------------------- 1 | # Package has not yet been submitted to Debian. 2 | new-package-should-close-itp-bug 3 | -------------------------------------------------------------------------------- /bin/ceph-medic: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from ceph_medic import main 4 | 5 | if __name__ == '__main__': 6 | main.Medic() 7 | -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Book.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Book.eot -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Book.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Book.ttf -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Book.woff: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Book.woff -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Medium.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Medium.eot -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Medium.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Medium.ttf -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/font/ApexSans-Medium.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ceph/ceph-medic/HEAD/docs/source/_themes/ceph/static/font/ApexSans-Medium.woff -------------------------------------------------------------------------------- /tests/functional/centos7/hosts: -------------------------------------------------------------------------------- 1 | [mons] 2 | mon0 address=192.168.3.10 3 | 4 | [osds] 5 | osd0 address=192.168.3.100 6 | 7 | [medic] 8 | client0 address=192.168.3.40 9 | -------------------------------------------------------------------------------- /docs/source/contents.rst: -------------------------------------------------------------------------------- 1 | ceph-medic contents 2 | =================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | index.rst 8 | installation.rst 9 | codes.rst 10 | facts.rst 11 | changelog.rst 12 | -------------------------------------------------------------------------------- /ceph_medic/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | # helps reset altered metadata in tests 3 | base_metadata = {'rgws': {}, 'mgrs': {}, 'mdss':{}, 'clients': {}, 4 | 'osds':{}, 'mons':{}, 'nodes': {}, 'cluster_name': 'ceph', 'failed_nodes': {}} 5 | 6 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | # Uncomment this to turn on verbose mode. 4 | export DH_VERBOSE=1 5 | 6 | export PYBUILD_NAME=ceph-medic 7 | 8 | export PYBUILD_TEST_ARGS=ceph_medic/tests 9 | 10 | %: 11 | dh $@ --buildsystem pybuild --with python2 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | .cache 7 | dist 8 | build 9 | eggs 10 | parts 11 | bin 12 | var 13 | sdist 14 | develop-eggs 15 | .installed.cfg 16 | 17 | # Installer logs 18 | pip-log.txt 19 | 20 | # Unit test / coverage reports 21 | .coverage 22 | .tox 23 | 24 | #Translations 25 | *.mo 26 | -------------------------------------------------------------------------------- /tests/functional/centos7/test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - hosts: medic 4 | become: no 5 | tasks: 6 | 7 | - name: copy hosts file to vagrant home dir 8 | command: cp /vagrant/hosts /home/vagrant 9 | become: yes 10 | 11 | - name: use ceph-medic to check ceph cluster 12 | command: ceph-medic --inventory /home/vagrant/hosts check 13 | 
-------------------------------------------------------------------------------- /ceph_medic/util/net.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | 4 | def host_is_resolvable(hostname, _socket=None): 5 | _socket = _socket or socket # just used for testing 6 | try: 7 | _socket.getaddrinfo(hostname, 0) 8 | except _socket.gaierror: 9 | msg = "hostname: %s is not resolvable" % hostname 10 | raise RuntimeError(msg) 11 | return True 12 | -------------------------------------------------------------------------------- /docs/source/codes/cluster.rst: -------------------------------------------------------------------------------- 1 | Cluster 2 | ======= 3 | Cluster checks run once against the information of a cluster, and are 4 | not specific to any deamon. 5 | 6 | 7 | Errors 8 | ------ 9 | 10 | .. _ECLS1: 11 | 12 | ECLS1 13 | ^^^^^ 14 | No OSD nodes exist as part of the cluster. 15 | 16 | .. _ECLS2: 17 | 18 | ECLS2 19 | ^^^^^ 20 | The cluster is nearfull. 
21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/functional/scripts/generate_ssh_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Generate a custom ssh config from Vagrant so that it can then be used by 3 | # ansible.cfg 4 | 5 | path=$1 6 | 7 | if [ $# -eq 0 ] 8 | then 9 | echo "A path to the scenario is required as an argument and it wasn't provided" 10 | exit 1 11 | fi 12 | 13 | cd "$path" 14 | vagrant ssh-config > vagrant_ssh_config 15 | -------------------------------------------------------------------------------- /ceph_medic/compat.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | try: 4 | import ConfigParser as configparser 5 | except ImportError: 6 | import configparser 7 | 8 | try: 9 | from ConfigParser import SafeConfigParser as BaseConfigParser 10 | except ImportError: 11 | from configparser import ConfigParser as BaseConfigParser 12 | 13 | try: 14 | from StringIO import StringIO 15 | except ImportError: 16 | from io import StringIO 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | ceph-medic 3 | ========== 4 | 5 | ``ceph-medic`` is a tool that performs checks against Ceph clusters to identify common issues preventing proper functionality. It supports Kubernetes and OpenShift, using ``kubectl`` and ``oc``, respectively. It requires non-interactive SSH access to accounts that can ``sudo`` without a password prompt. 
6 | 7 | Full usage documentation can be found at: http://docs.ceph.com/ceph-medic 8 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27, py36, py37, flake8 3 | 4 | [testenv] 5 | deps= 6 | pytest 7 | mock 8 | commands=py.test -v {posargs:ceph_medic/tests} 9 | 10 | [testenv:docs] 11 | basepython=python 12 | changedir=docs/source 13 | deps=sphinx 14 | commands= 15 | sphinx-build -W -b html -d {envtmpdir}/doctrees . {envtmpdir}/html 16 | 17 | [testenv:flake8] 18 | deps=flake8 19 | commands=flake8 --select=F,E9 --exclude=vendor {posargs:ceph_medic} 20 | -------------------------------------------------------------------------------- /tests/functional/centos7/group_vars/all: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | ceph_origin: repository 4 | ceph_repository: community 5 | ceph_stable_release: luminous 6 | cluster: test 7 | public_network: "192.168.3.0/24" 8 | cluster_network: "192.168.4.0/24" 9 | monitor_interface: eth1 10 | journal_size: 100 11 | osd_objectstore: "filestore" 12 | devices: 13 | - '/dev/sda' 14 | - '/dev/sdb' 15 | osd_scenario: collocated 16 | os_tuning_params: 17 | - { name: kernel.pid_max, value: 4194303 } 18 | - { name: fs.file-max, value: 26234859 } 19 | -------------------------------------------------------------------------------- /ceph_medic/rules/jewel.py: -------------------------------------------------------------------------------- 1 | # Rules to apply for Jewel releases. 2 | 3 | # All checks are applied, but overrides to defaults can 4 | # be specified here. 5 | # overrides = { 6 | # # overrides the check called 'check_name' with a different expected value 7 | # # and changes # the level of this check to 'error'. 
8 | # "check_name": {"expected": "value", "level": "error"}, 9 | #} 10 | 11 | # Exclude the following checks: 12 | # excludes = ["check_name"] 13 | 14 | # Include the following checks: 15 | # includes = ["check_name"] 16 | -------------------------------------------------------------------------------- /ceph_medic/rules/kraken.py: -------------------------------------------------------------------------------- 1 | # Rules to apply for Jewel releases. 2 | 3 | # All checks are applied, but overrides to defaults can 4 | # be specified here. 5 | # overrides = { 6 | # # overrides the check called 'check_name' with a different expected value 7 | # # and changes # the level of this check to 'error'. 8 | # "check_name": {"expected": "value", "level": "error"}, 9 | #} 10 | 11 | # Exclude the following checks: 12 | # excludes = ["check_name"] 13 | 14 | # Include the following checks: 15 | # includes = ["check_name"] 16 | -------------------------------------------------------------------------------- /docs/source/codes.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Error Codes 3 | =========== 4 | 5 | When performing checks, ``ceph-medic`` will return an error code and message for any that failed. These checks 6 | can either be a ``warning`` or ``error``, and will pertain to common issues or daemon specific issues. Any error 7 | code starting with ``E`` is an error, and any starting with ``W`` is a warning. 8 | 9 | Below you'll find a list of checks that are performed with the ``check`` subcommand. 10 | 11 | 12 | .. 
toctree:: 13 | :maxdepth: 2 14 | 15 | codes/common.rst 16 | codes/mons.rst 17 | codes/osds.rst 18 | codes/cluster.rst 19 | -------------------------------------------------------------------------------- /ceph_medic/checks/cluster.py: -------------------------------------------------------------------------------- 1 | from ceph_medic import metadata 2 | 3 | 4 | # 5 | # Error checks 6 | # 7 | 8 | def check_osds_exist(): 9 | code = 'ECLS1' 10 | msg = 'There are no OSDs available' 11 | osd_count = len(metadata['osds'].keys()) 12 | if not osd_count: 13 | return code, msg 14 | 15 | 16 | def check_nearfull(): 17 | """ 18 | Checks if the osd capacity is at nearfull 19 | """ 20 | code = 'ECLS2' 21 | msg = 'Cluster is nearfull' 22 | try: 23 | osd_map = metadata['cluster']['status']['osdmap']['osdmap'] 24 | except KeyError: 25 | return 26 | if osd_map.get('nearfull'): 27 | return code, msg 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = ceph-medic 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /ceph_medic/util/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | def str_to_int(string): 3 | """ 4 | Parses a string number into an integer, optionally converting to a float 5 | and rounding down. 6 | 7 | Some LVM values may come with a comma instead of a dot to define decimals. 8 | This function normalizes a comma into a dot 9 | """ 10 | try: 11 | integer = float(string.replace(',', '.')) 12 | except AttributeError: 13 | # this might be a integer already, so try to use it, otherwise raise 14 | # the original exception 15 | if isinstance(string, (int, float)): 16 | integer = string 17 | else: 18 | raise 19 | 20 | return int(integer) 21 | -------------------------------------------------------------------------------- /docs/source/_templates/smarttoc.html: -------------------------------------------------------------------------------- 1 | {# 2 | Sphinx sidebar template: smart table of contents. 3 | 4 | Shows a sidebar ToC that gives you a more global view of the 5 | documentation, and not the confusing cur/prev/next which is the 6 | default sidebar. 7 | 8 | The ToC will open and collapse automatically to show the part of the 9 | hierarchy you are in. Top-level items will always be visible. 10 | 11 | #} 12 |

{{ _('Table Of Contents') }}

13 | {{ toctree(maxdepth=-1) }} 14 | 15 | 17 | -------------------------------------------------------------------------------- /ceph_medic/log.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging 3 | import os 4 | 5 | BASE_FORMAT = "[%(name)s][%(levelname)-6s] %(message)s" 6 | FILE_FORMAT = "[%(asctime)s]" + BASE_FORMAT 7 | 8 | 9 | def setup(config=None): 10 | root_logger = logging.getLogger() 11 | log_path = config.get_safe('global', '--log-path', '.') 12 | if not os.path.exists(log_path): 13 | raise RuntimeError('configured ``--log-path`` value does not exist: %s' % log_path) 14 | date = datetime.strftime(datetime.utcnow(), '%Y-%m-%d') 15 | log_file = os.path.join(log_path, 'ceph-medic-%s.log' % date) 16 | 17 | root_logger.setLevel(logging.DEBUG) 18 | 19 | # File Logger 20 | fh = logging.FileHandler(log_file) 21 | fh.setLevel(logging.DEBUG) 22 | fh.setFormatter(logging.Formatter(FILE_FORMAT)) 23 | 24 | root_logger.addHandler(fh) 25 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | ceph-medic (1.0.8) stable; urgency=medium 2 | 3 | * New upstream release 4 | 5 | -- Ceph Release Team Wed, 17 Jun 2020 16:15:00 -0600 6 | 7 | ceph-medic (1.0.7) stable; urgency=medium 8 | 9 | * New upstream release 10 | 11 | -- Ceph Release Team Tue, 24 Mar 2020 17:29:00 -0600 12 | 13 | ceph-medic (1.0.6) stable; urgency=medium 14 | 15 | * New upstream release 16 | 17 | -- Ceph Release Team Tue, 11 Feb 2020 16:41:07 -0600 18 | 19 | ceph-medic (1.0.4) stable; urgency=medium 20 | 21 | * New upstream release 22 | 23 | -- Ceph Release Team Tue, 27 Mar 2018 20:19:38 +0000 24 | 25 | ceph-medic (0.0.1-1) unstable; urgency=medium 26 | 27 | * Initial release. 
28 | 29 | -- Ken Dreyer Wed, 28 Jun 2017 13:20:07 -0600 30 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: ceph-medic 2 | Maintainer: Alfredo Deza 3 | Section: admin 4 | Priority: optional 5 | Build-Depends: 6 | debhelper (>= 7), 7 | dh-python, 8 | python, 9 | python-mock, 10 | python-pytest, 11 | python-remoto, 12 | python-setuptools, 13 | python-tambo 14 | X-Python-Version: >= 2.7 15 | Standards-Version: 3.9.7 16 | Homepage: http://ceph.com/ 17 | Vcs-Git: git://github.com/ceph/ceph-medic.git 18 | Vcs-Browser: https://github.com/ceph/ceph-medic 19 | 20 | Package: ceph-medic 21 | Architecture: all 22 | Depends: ${misc:Depends}, ${python:Depends} 23 | Description: determine common issues on Ceph storage clusters 24 | ceph-medic is a very simple tool to run against a Ceph cluster to detect 25 | common issues that might prevent correct functionality. It requires 26 | non-interactive SSH access to accounts that can sudo without a password 27 | prompt. 28 | -------------------------------------------------------------------------------- /ceph_medic/remote/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def which(executable): 6 | """find the location of an executable""" 7 | locations = ( 8 | '/usr/local/bin', 9 | '/bin', 10 | '/usr/bin', 11 | '/usr/local/sbin', 12 | '/usr/sbin', 13 | '/sbin', 14 | ) 15 | 16 | for location in locations: 17 | executable_path = os.path.join(location, executable) 18 | if os.path.exists(executable_path): 19 | return executable_path 20 | 21 | 22 | def run(command): 23 | """ 24 | run a command, return stdout, stderr, and exit code. 
25 | """ 26 | process = subprocess.Popen( 27 | command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True 28 | ) 29 | stdout = process.stdout.read().splitlines() 30 | stderr = process.stderr.read().splitlines() 31 | returncode = process.wait() 32 | 33 | return stdout, stderr, returncode 34 | -------------------------------------------------------------------------------- /ceph_medic/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | class UnloadedConfig(object): 5 | """ 6 | This class is used as the default value for config.ceph so that if 7 | a configuration file is not successfully loaded then it will give 8 | a nice error message when values from the config are used. 9 | """ 10 | def __init__(self, error=None): 11 | self.error = error 12 | 13 | def __getattr__(self, *a): 14 | raise RuntimeError(self.error) 15 | 16 | 17 | config = namedtuple('config', ['verbosity', 'nodes', 'hosts_file', 'file', 'cluster_name']) 18 | config.file = UnloadedConfig("No valid ceph-medic configuration file was loaded") 19 | config.nodes = {} 20 | 21 | metadata = {'failed_nodes': {}, 'rgws': {}, 'mgrs': {}, 'mdss': {}, 'clients': {}, 'osds': {}, 'mons': {}, 'nodes': {}, 'cluster': {}} 22 | 23 | daemon_types = [i for i in metadata.keys() if i not in ('nodes', 'failed_nodes', 'cluster')] 24 | 25 | __version__ = '1.0.8' 26 | -------------------------------------------------------------------------------- /docs/source/codes/mons.rst: -------------------------------------------------------------------------------- 1 | Monitors 2 | ======== 3 | 4 | The following checks indicate issues with monitor nodes. 5 | 6 | Errors 7 | ------ 8 | 9 | .. _EMON1: 10 | 11 | EMON1 12 | _____ 13 | The secret key used in the keyring differs from other nodes in the cluster. 14 | 15 | Warnings 16 | -------- 17 | 18 | 19 | .. 
_WMON1: 20 | 21 | WMON1 22 | _____ 23 | Multiple monitor directories are found on the same host. 24 | 25 | .. _WMON2: 26 | 27 | WMON2 28 | _____ 29 | Collocated OSDs in monitor nodes were found on the same host. 30 | 31 | .. _WMON3: 32 | 33 | WMON3 34 | _____ 35 | The recommended number of Monitor nodes is 3 for a high availability setup. 36 | 37 | .. _WMON4: 38 | 39 | WMON4 40 | _____ 41 | It is recommended to have an odd number of monitors so that failures can be 42 | tolerated. 43 | 44 | 45 | .. _WMON5: 46 | 47 | WMON5 48 | _____ 49 | Having a single monitor is not recommneded, as a failure would cause data loss. 50 | For high availability, at least 3 monitors is recommended. 51 | -------------------------------------------------------------------------------- /ceph_medic/tests/test_terminal.py: -------------------------------------------------------------------------------- 1 | from ceph_medic import terminal 2 | 3 | 4 | class FakeWriter(object): 5 | 6 | def __init__(self): 7 | self.calls = [] 8 | 9 | def write(self, string): 10 | self.calls.append(string) 11 | 12 | def flush(self): 13 | pass 14 | 15 | 16 | class TestWriteClearLine(object): 17 | 18 | def setup(self): 19 | self.fake_writer = FakeWriter() 20 | self.loader = terminal._Write( 21 | _writer=self.fake_writer, 22 | prefix='\r', 23 | clear_line=True 24 | ) 25 | 26 | def test_adds_padding_for_81_chars(self): 27 | self.loader.write('1234567890') 28 | assert len(self.fake_writer.calls[0]) == 81 29 | 30 | def test_remaining_padding_is_whitespace(self): 31 | self.loader.write('1234567890') 32 | assert self.fake_writer.calls[0][11:] == ' ' * 70 33 | 34 | def test_long_line_adds_only_ten_chars(self): 35 | self.loader.write('1'*81) 36 | assert self.fake_writer.calls[0][82:] == ' ' * 10 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Red Hat, Inc. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
20 | -------------------------------------------------------------------------------- /ceph_medic/tests/checks/test_cluster.py: -------------------------------------------------------------------------------- 1 | from ceph_medic.checks import cluster 2 | from ceph_medic import metadata 3 | 4 | 5 | class TestCheckOSDs(object): 6 | 7 | def setup(self): 8 | metadata['cluster_name'] = 'ceph' 9 | metadata['osds'] = {} 10 | 11 | def teardown(self): 12 | metadata['osds'] = {} 13 | 14 | def test_no_osds(self): 15 | assert cluster.check_osds_exist() == ('ECLS1', 'There are no OSDs available') 16 | 17 | def test_osds_are_found(self): 18 | metadata['osds'] = {'osd1': {}} 19 | assert cluster.check_osds_exist() is None 20 | 21 | class TestNearfull(object): 22 | 23 | def setup(self): 24 | metadata['cluster'] = {} 25 | 26 | def teardown(self): 27 | metadata['cluster'] = {} 28 | 29 | def test_key_error_is_ignored(self): 30 | assert cluster.check_nearfull() is None 31 | def test_osd_map_is_nearfull(self): 32 | metadata['cluster'] = {'status': {'osdmap': {'osdmap': {'nearfull': True}}}} 33 | assert cluster.check_nearfull() == ('ECLS2', 'Cluster is nearfull') 34 | def test_osd_map_is_not_nearfull(self): 35 | metadata['cluster'] = {'status': {'osdmap': {'osdmap': {'nearfull': False}}}} 36 | -------------------------------------------------------------------------------- /ceph-medic.spec.in: -------------------------------------------------------------------------------- 1 | # 2 | # spec file for package ceph-medic 3 | # 4 | %global commit @COMMIT@ 5 | %global shortcommit %(c=%{commit}; echo ${c:0:7}) 6 | 7 | Name: ceph-medic 8 | Version: @VERSION@ 9 | Release: @RELEASE@%{?dist} 10 | Summary: Find common issues on Ceph clusters 11 | License: MIT 12 | URL: https://github.com/ceph/ceph-medic 13 | Source0: %{name}-%{version}-%{shortcommit}.tar.gz 14 | BuildRequires: python-devel 15 | BuildRequires: python-setuptools 16 | BuildRequires: pytest 17 | BuildRequires: python-remoto 18 | 
BuildRequires: python-mock 19 | BuildRequires: python-tambo 20 | Requires: python-remoto 21 | Requires: python-tambo 22 | Requires: python-execnet 23 | 24 | BuildArch: noarch 25 | 26 | 27 | %description 28 | An admin tool to determine common issues on Ceph storage clusters. 29 | 30 | %prep 31 | %autosetup -p1 32 | 33 | %build 34 | python setup.py build 35 | 36 | %install 37 | python setup.py install -O1 --skip-build --root %{buildroot} 38 | 39 | %check 40 | export PYTHONPATH=$(pwd) 41 | 42 | py.test-%{python_version} -v ceph_medic/tests 43 | 44 | %files 45 | %license LICENSE 46 | %doc README.rst 47 | %{_bindir}/ceph-medic 48 | %{python_sitelib}/* 49 | 50 | %changelog 51 | -------------------------------------------------------------------------------- /ceph_medic/tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import ceph_medic.main 3 | 4 | from mock import patch 5 | 6 | 7 | class TestMain(object): 8 | def test_main(self): 9 | assert ceph_medic.main 10 | 11 | def test_invalid_ssh_config(self, capsys): 12 | argv = ["ceph-medic", "--ssh-config", "/does/not/exist"] 13 | with pytest.raises(SystemExit): 14 | ceph_medic.main.Medic(argv) 15 | out, _ = capsys.readouterr() 16 | assert 'the given ssh config path does not exist' in out 17 | 18 | def test_valid_ssh_config(self, capsys): 19 | ssh_config = '/etc/ssh/ssh_config' 20 | argv = ["ceph-medic", "--ssh-config", ssh_config] 21 | 22 | def fake_exists(path): 23 | if path == ssh_config: 24 | return True 25 | if path.endswith('cephmedic.conf'): 26 | return False 27 | return True 28 | 29 | with patch.object(ceph_medic.main.os.path, 'exists') as m_exists: 30 | m_exists.side_effect = fake_exists 31 | ceph_medic.main.Medic(argv) 32 | out, _ = capsys.readouterr() 33 | assert 'ssh config path does not exist' not in out 34 | assert ssh_config == ceph_medic.main.ceph_medic.config.ssh_config 35 | 
def load_config(filepath, **kw):
    '''
    Creates a configuration dictionary from a file.

    The file is expected to contain valid Python. It is compiled first to
    surface SyntaxErrors with accurate file/line information, optionally
    imported as a module (purely for more verbose import-time error
    reporting), and finally exec'd so that its top-level names become the
    keys of the returned dictionary.

    :param filepath: The path to the file. ``~`` is expanded.
    :param kw: Unused; accepted for call-site compatibility.
    :returns: dict of the file's top-level names, plus an ``__file__`` key
        set to the absolute path of the file.
    :raises RuntimeError: if ``filepath`` does not point to a regular file.
    '''

    abspath = os.path.abspath(os.path.expanduser(filepath))
    conf_dict = {}
    if not os.path.isfile(abspath):
        raise RuntimeError('`%s` is not a file.' % abspath)

    # First, make sure the code will actually compile (and has no SyntaxErrors)
    with open(abspath, 'rb') as f:
        compiled = compile(f.read(), abspath, 'exec')

    # Next, attempt to actually import the file as a module.
    # This provides more verbose import-related error reporting than exec()
    # NOTE(review): `imp` is deprecated (removed in Python 3.12); migrating
    # to importlib would need its own change and testing
    absname, _ = os.path.splitext(abspath)
    basepath, module_name = absname.rsplit(os.sep, 1)
    try:
        imp.load_module(
            module_name,
            *imp.find_module(module_name, [basepath])
        )
    except ImportError:
        # best-effort only: the exec below is what actually populates
        # conf_dict, so a failed import is deliberately ignored
        pass

    # If we were able to import as a module, actually exec the compiled code
    exec(compiled, globals(), conf_dict)
    conf_dict['__file__'] = abspath
    return conf_dict
class TestLogSetup(object):
    """Tests for log.setup() behavior around the --log-path option."""

    def teardown(self):
        # drop any handlers added by log.setup so tests stay isolated
        logging.getLogger().handlers = []

    def test_barf_when_config_path_does_not_exist(self, tmpdir):
        location = os.path.join(str(tmpdir), 'ceph-medic.conf')
        with open(location, 'w') as _f:
            _f.write("""\n[global]\n--log-path=/bogus/path""")
        config = configuration.load(location)
        with pytest.raises(RuntimeError) as error:
            log.setup(config)
        assert 'value does not exist' in str(error.value)

    def test_create_log_config_correctly(self, tmpdir):
        tmp_log_path = str(tmpdir)
        location = os.path.join(tmp_log_path, 'ceph-medic.conf')
        with open(location, 'w') as _f:
            _f.write("""\n[global]\n--log-path=%s""" % tmp_log_path)
        log.setup(configuration.load(location))
        # tox has its own logger now, we need to make sure we are talking
        # about the actual configured ones by ceph-medic
        handlers = logging.getLogger().handlers
        ceph_medic_loggers = [
            h for h in handlers if 'ceph-medic' in getattr(h, 'baseFilename', '')
        ]
        assert len(ceph_medic_loggers) == 1
-------------------------------------------------------------------------------- /tests/functional/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = {ansible2.2,ansible2.3,ansible2.4}-{nightly_centos7} 3 | skipsdist = True 4 | 5 | [testenv] 6 | whitelist_externals = 7 | vagrant 8 | bash 9 | git 10 | passenv=* 11 | setenv= 12 | ANSIBLE_SSH_ARGS = -F {changedir}/vagrant_ssh_config 13 | ansible2.2: ANSIBLE_STDOUT_CALLBACK = debug 14 | ANSIBLE_RETRY_FILES_ENABLED = False 15 | ANSIBLE_SSH_RETRIES = 5 16 | ANSIBLE_ACTION_PLUGINS = {envdir}/tmp/ceph-ansible/plugins/actions 17 | deps= 18 | ansible1.9: ansible==1.9.4 19 | ansible2.1: ansible==2.1 20 | ansible2.2: ansible==2.2.3 21 | ansible2.3: ansible==2.3.1 22 | ansible2.4: ansible==2.4.2 23 | notario>=0.0.13 24 | changedir= 25 | nightly_centos7: {toxinidir}/centos7 26 | commands= 27 | git clone -b {env:CEPH_ANSIBLE_BRANCH:master} --single-branch https://github.com/ceph/ceph-ansible.git {envdir}/tmp/ceph-ansible 28 | 29 | vagrant up --no-provision {posargs:--provider=virtualbox} 30 | bash {toxinidir}/scripts/generate_ssh_config.sh {changedir} 31 | 32 | # install ceph-medic on 'client0' vm and setup nodes for testing 33 | ansible-playbook -vv -i {changedir}/hosts {toxinidir}/playbooks/setup.yml --extra-vars="ceph_medic_branch={env:CEPH_MEDIC_DEV_BRANCH:master}" 34 | # use ceph-ansible to deploy a ceph cluster on the rest of the vms 35 | ansible-playbook -vv -i {changedir}/hosts {envdir}/tmp/ceph-ansible/site.yml.sample 36 | # use ceph-medic to check the cluster we just created 37 | ansible-playbook -vv -i {changedir}/hosts {changedir}/test.yml 38 | 39 | vagrant destroy --force 40 | -------------------------------------------------------------------------------- /docs/source/changelog.rst: -------------------------------------------------------------------------------- 1 | 1.0.8 2 | ----- 3 | 17-Jun-2020 4 | 5 | * Fix issues with podman support 6 | 7 | 1.0.7 8 
| ----- 9 | 24-Mar-2020 10 | 11 | * Fix test bugs that were breaking rpm builds 12 | 13 | 1.0.6 14 | ----- 15 | 11-Feb-2020 16 | 17 | * Docker, podman container support 18 | * Fix broken SSH config option 19 | * Fix querying the Ceph version via admin socket on newer Ceph versions 20 | 21 | 1.0.5 22 | ----- 23 | 27-Jun-2019 24 | 25 | * Add check for minimum OSD node count 26 | * Add check for minimum MON node count 27 | * Remove reporting of nodes that can't connect, report them separetely 28 | * Kubernetes, Openshift, container support 29 | * Fix unidentifiable user/group ID issues 30 | * Rook support 31 | * Report on failed nodes 32 | * When there are errors, set a non-zero exit status 33 | * Add separate "cluster wide" checks, which run once 34 | * Be able to retrieve socket configuration 35 | * Fix issue with trying to run ``whoami`` to test remote connections, use 36 | ``true`` instead 37 | * Add check for missing FSID 38 | * Skip OSD validation when there isn't any ceph.conf 39 | * Skip tmp directories in /var/lib/ceph scanning to prevent blowing up 40 | * Detect collocated daemons 41 | * Allow overriding ignores in the CLI, fallback to the config file 42 | * Break documentation up to have installation away from getting started 43 | 44 | 45 | 1.0.4 46 | ----- 47 | 20-Aug-2018 48 | 49 | * Add checks for parity between installed and socket versions 50 | * Fix issues with loading configuration with whitespace 51 | * Add check for min_pool_size 52 | * Collect versions from running daemons 53 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing to ceph-medic 2 | =========================== 3 | Before any contributions, a reference ticket *must* exist. 
To open a new 4 | issue, requests can go to: 5 | 6 | https://github.com/ceph/ceph-medic/issues/new 7 | 8 | commits 9 | ------- 10 | Once a ticket exists, commits should be prefaced by the ticket ID. This makes 11 | it easier for maintainers to keep track of why a given line changed, mapping 12 | directly to work done on a ticket. 13 | 14 | For tickets coming from tracker.ceph.com, we expect the following format:: 15 | 16 | [RM-0000] this is a commit message for tracker.ceph.com 17 | 18 | ``RM`` stands for Redmine which is the software running tracker.ceph.com. 19 | Similarly, if a ticket was created in bugzilla.redhat.com, we expect the 20 | following format:: 21 | 22 | [BZ-0000] this is a commit message for bugzilla.redhat.com 23 | 24 | 25 | To automate this process, you can create a branch with the tracker identifier 26 | and id (replace "0000" with the ticket number):: 27 | 28 | git checkout -b RM-0000 29 | 30 | And then use the follow prepare-commit-msg: 31 | https://gist.github.com/alfredodeza/6d62d99a95c9a7975fbe 32 | 33 | Copy that file to ``$GITREPOSITORY/.git/hooks/prepare-commit-msg`` 34 | and mark it executable. 35 | 36 | Your commit messages should then be automatically prefixed with the branch name 37 | based off of the issue tracker. 38 | 39 | tests and documentation 40 | ----------------------- 41 | Wherever it is feasible, tests must exist and documentation must be added or 42 | improved depending on the change. 43 | 44 | The build process not only runs tests but ensures that docs can be built from 45 | the proposed changes as well. 
46 | -------------------------------------------------------------------------------- /tests/functional/playbooks/setup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | gather_facts: True 4 | tasks: 5 | - name: write all nodes to /etc/hosts 6 | sudo: yes 7 | blockinfile: 8 | dest: /etc/hosts 9 | block: | 10 | {{ hostvars[item]["address"] }} {{ item }} 11 | marker: "# {mark} ANSIBLE MANAGED BLOCK {{ item }}" 12 | with_inventory_hostnames: all 13 | 14 | - hosts: medic 15 | become: yes 16 | tasks: 17 | 18 | - name: fetch shaman ceph-medic repo 19 | get_url: 20 | url: https://shaman.ceph.com/api/repos/ceph-medic/{{ ceph_medic_branch }}/latest/centos/7/repo 21 | dest: /etc/yum.repos.d/ceph-medic.repo 22 | 23 | - name: print contents of /etc/yum.repos.d/ceph-medic.repo 24 | command: cat /etc/yum.repos.d/ceph-medic.repo 25 | 26 | - name: install epel-release 27 | yum: 28 | name: epel-release 29 | state: present 30 | 31 | - name: install python-tambo 32 | yum: 33 | name: python-tambo 34 | state: present 35 | enablerepo: epel-testing 36 | 37 | - name: install ceph-medic 38 | yum: 39 | name: ceph-medic 40 | state: present 41 | 42 | - name: test ceph-medic install 43 | become: no 44 | command: ceph-medic --help 45 | 46 | - name: copy vagrant insecure private ssh key 47 | copy: 48 | src: ~/.vagrant.d/insecure_private_key 49 | dest: /home/vagrant/.ssh/id_dsa 50 | mode: 0600 51 | owner: vagrant 52 | group: vagrant 53 | 54 | - name: turn off StrictHostKeyChecking 55 | blockinfile: 56 | dest: /home/vagrant/.ssh/config 57 | create: yes 58 | mode: 0400 59 | owner: vagrant 60 | group: vagrant 61 | block: | 62 | Host * 63 | StrictHostKeyChecking no 64 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for constructing RPMs. 
2 | # Try "make" (for SRPMS) or "make rpm" 3 | 4 | NAME = ceph-medic 5 | 6 | # Set the RPM package NVR from "git describe". 7 | # Examples: 8 | # 9 | # A "git describe" value of "v2.2.0rc1" would create an NVR 10 | # "ceph-medic-2.2.0-0.rc1.1.el7" 11 | # 12 | # A "git describe" value of "v2.2.0rc1-1-gc465f85" would create an NVR 13 | # "ceph-medic-2.2.0-0.rc1.1.gc465f85.el7" 14 | # 15 | # A "git describe" value of "v2.2.0" creates an NVR 16 | # "ceph-medic-2.2.0-1.el7" 17 | 18 | VERSION := $(shell git describe --tags --abbrev=0 --match 'v*' | sed 's/^v//') 19 | COMMIT := $(shell git rev-parse HEAD) 20 | SHORTCOMMIT := $(shell echo $(COMMIT) | cut -c1-7) 21 | RELEASE := $(shell git describe --tags --match 'v*' \ 22 | | sed 's/^v//' \ 23 | | sed 's/^[^-]*-//' \ 24 | | sed 's/-/./') 25 | ifeq ($(VERSION),$(RELEASE)) 26 | RELEASE = 1 27 | endif 28 | ifneq (,$(findstring rc,$(VERSION))) 29 | RC := $(shell echo $(VERSION) | sed 's/.*rc/rc/') 30 | RELEASE := 0.$(RC).$(RELEASE) 31 | VERSION := $(subst $(RC),,$(VERSION)) 32 | endif 33 | NVR := $(NAME)-$(VERSION)-$(RELEASE).el7 34 | 35 | all: srpm 36 | 37 | # Testing only 38 | echo: 39 | echo COMMIT $(COMMIT) 40 | echo VERSION $(VERSION) 41 | echo RELEASE $(RELEASE) 42 | echo NVR $(NVR) 43 | 44 | clean: 45 | rm -rf dist/ 46 | rm -rf ceph-medic-$(VERSION)-$(SHORTCOMMIT).tar.gz 47 | rm -rf $(NVR).src.rpm 48 | 49 | dist: 50 | git archive --format=tar.gz --prefix=ceph-medic-$(VERSION)/ HEAD > ceph-medic-$(VERSION)-$(SHORTCOMMIT).tar.gz 51 | 52 | spec: 53 | sed ceph-medic.spec.in \ 54 | -e 's/@COMMIT@/$(COMMIT)/' \ 55 | -e 's/@VERSION@/$(VERSION)/' \ 56 | -e 's/@RELEASE@/$(RELEASE)/' \ 57 | > ceph-medic.spec 58 | 59 | srpm: dist spec 60 | fedpkg -v --dist epel7 srpm 61 | 62 | rpm: dist srpm 63 | mock -r epel-7-x86_64 rebuild $(NVR).src.rpm \ 64 | --resultdir=. 
\ 65 | --define "dist .el7" 66 | 67 | .PHONY: dist rpm srpm 68 | -------------------------------------------------------------------------------- /docs/source/codes/common.rst: -------------------------------------------------------------------------------- 1 | Common 2 | ====== 3 | The following checks indiciate general issues with the cluster that are not specific to any daemon type. 4 | 5 | Warnings 6 | -------- 7 | 8 | .. _WCOM1: 9 | 10 | WCOM1 11 | ^^^^^ 12 | A running OSD and MON daemon were detected in the same node. Colocating OSDs and MONs is highly discouraged. 13 | 14 | 15 | Errors 16 | ------ 17 | 18 | .. _ECOM1: 19 | 20 | ECOM1 21 | ^^^^^ 22 | A ceph configuration file cannot be found at ``/etc/ceph/$cluster-name.conf``. 23 | 24 | .. _ECOM2: 25 | 26 | ECOM2 27 | ^^^^^ 28 | The ``ceph`` executable was not found. 29 | 30 | .. _ECOM3: 31 | 32 | ECOM3 33 | ^^^^^ 34 | The ``/var/lib/ceph`` directory does not exist or could not be collected. 35 | 36 | .. _ECOM4: 37 | 38 | ECOM4 39 | ^^^^^ 40 | The ``/var/lib/ceph`` directory was not owned by the ``ceph`` user. 41 | 42 | .. _ECOM5: 43 | 44 | ECOM5 45 | ^^^^^ 46 | The ``fsid`` defined in the configuration differs from other nodes in the cluster. The ``fsid`` must be 47 | the same for all nodes in the cluster. 48 | 49 | .. _ECOM6: 50 | 51 | ECOM6 52 | ^^^^^ 53 | The installed version of ``ceph`` is not the same for all nodes in the cluster. The ``ceph`` version should be 54 | the same for all nodes in the cluster. 55 | 56 | .. _ECOM7: 57 | 58 | ECOM7 59 | ^^^^^ 60 | The installed version of ``ceph`` is not the same as the one of a running ceph daemon. The installed ``ceph`` version should be the same as all running ceph daemons. If they do not match, the daemons most likely have not been restarted correctly after a version change. 61 | 62 | .. _ECOM8: 63 | 64 | ECOM8 65 | ^^^^^ 66 | The ``fsid`` field must exist in the configuration for each node. 67 | 68 | 69 | .. 
_ECOM9: 70 | 71 | ECOM9 72 | ^^^^^ 73 | A cluster should not have running daemons with a cluster ``fsid`` that is different from the rest of the daemons in a cluster. This potentially means that different cluster identifiers are being used, and that should not be the case. 74 | 75 | 76 | .. _ECOM10: 77 | 78 | ECOM10 79 | ^^^^^^ 80 | Only a single monitor daemon shuld be running per host, having more than one monitor running on the same host reduces a cluster's resilience if the node goes down. 81 | -------------------------------------------------------------------------------- /ceph_medic/decorators.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from ceph_medic import terminal 4 | from functools import wraps 5 | 6 | 7 | def catches(catch=None, handler=None, exit=True): 8 | """ 9 | Very simple decorator that tries any of the exception(s) passed in as 10 | a single exception class or tuple (containing multiple ones) returning the 11 | exception message and optionally handling the problem if it rises with the 12 | handler if it is provided. 13 | 14 | So instead of douing something like this:: 15 | 16 | def bar(): 17 | try: 18 | some_call() 19 | print "Success!" 20 | except TypeError, exc: 21 | print "Error while handling some call: %s" % exc 22 | sys.exit(1) 23 | 24 | You would need to decorate it like this to have the same effect:: 25 | 26 | @catches(TypeError) 27 | def bar(): 28 | some_call() 29 | print "Success!" 30 | 31 | If multiple exceptions need to be catched they need to be provided as a 32 | tuple:: 33 | 34 | @catches((TypeError, AttributeError)) 35 | def bar(): 36 | some_call() 37 | print "Success!" 
def make_exception_message(exc):
    """
    An exception is passed in and this function returns the proper string
    depending on the result so it is readable enough: a red-arrow prefix,
    the exception class name, and the message (when there is one).
    """
    name = exc.__class__.__name__
    description = str(exc)
    if description:
        return '%s %s: %s\n' % (terminal.red_arrow, name, description)
    return '%s %s\n' % (terminal.red_arrow, name)
class TestCephVersion(object):
    """Tests for commands.ceph_version (installed-package version)."""

    def test_gets_ceph_version(self, stub_check):
        stub_check(
            (['ceph version 14.1.1 (nautilus)', ''], [], 0),
            commands, 'check')
        assert commands.ceph_version(None) == 'ceph version 14.1.1 (nautilus)'

    def test_handles_non_zero_status(self, stub_check, conn):
        # a failing `ceph --version` call must yield None, not the stderr
        stub_check(
            (['error mr. robinson', ''], [], 1),
            commands, 'check')
        assert commands.ceph_version(conn) is None


class TestDaemonSocketConfig(object):
    """Tests for commands.daemon_socket_config (admin-socket config dump)."""

    def test_loadable_json(self, stub_check, conn):
        stub_check((['{"config": true}'], [], 0), commands, 'check')
        assert commands.daemon_socket_config(conn, '/') == {'config': True}

    def test_unloadable_json(self, stub_check, conn):
        # malformed JSON from the socket degrades to an empty mapping
        stub_check((['{config: []}'], [], 0), commands, 'check')
        assert commands.daemon_socket_config(conn, '/') == {}
They are under better control and will 30 | # not get updated frequently unless required for build systems. These are (for 31 | # now): 32 | # 33 | # * ceph/ubuntu-xenial 34 | # 35 | # Ubuntu: ceph/ubuntu-xenial bento/ubuntu-16.04 or ubuntu/trusty64 or ubuntu/wily64 36 | # CentOS: bento/centos-7.1 or puppetlabs/centos-7.0-64-puppet 37 | # libvirt CentOS: centos/7 38 | # parallels Ubuntu: parallels/ubuntu-14.04 39 | # Debian: deb/jessie-amd64 - be careful the storage controller is named 'SATA Controller' 40 | # For more boxes have a look at: 41 | # - https://atlas.hashicorp.com/boxes/search?utf8=✓&sort=&provider=virtualbox&q= 42 | # - https://download.gluster.org/pub/gluster/purpleidea/vagrant/ 43 | vagrant_box: centos/7 44 | client_vagrant_box: centos/7 45 | #ssh_private_key_path: "~/.ssh/id_rsa" 46 | # The sync directory changes based on vagrant box 47 | # Set to /home/vagrant/sync for Centos/7, /home/{ user }/vagrant for openstack and defaults to /vagrant 48 | #vagrant_sync_dir: /home/vagrant/sync 49 | #vagrant_sync_dir: / 50 | # Disables synced folder creation. Not needed for testing, will skip mounting 51 | # the vagrant directory on the remote box regardless of the provider. 52 | vagrant_disable_synced_folder: true 53 | # VAGRANT URL 54 | # This is a URL to download an image from an alternate location. vagrant_box 55 | # above should be set to the filename of the image. 
56 | # Fedora virtualbox: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box 57 | # Fedora libvirt: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-libvirt.box 58 | # vagrant_box_url: https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-Vagrant-22-20150521.x86_64.vagrant-virtualbox.box 59 | -------------------------------------------------------------------------------- /ceph_medic/util/mon.py: -------------------------------------------------------------------------------- 1 | import remoto 2 | import json 3 | import ceph_medic 4 | from ceph_medic import terminal 5 | 6 | 7 | def get_mon_report(conn): 8 | command = [ 9 | 'ceph', 10 | '--cluster=%s' % ceph_medic.metadata['cluster_name'], 11 | 'report' 12 | ] 13 | out, err, code = remoto.process.check( 14 | conn, 15 | command 16 | ) 17 | 18 | if code > 0: 19 | terminal.error('failed to connect to the cluster to fetch a report from the monitor') 20 | terminal.error('command: %s' % ' '.join(command)) 21 | for line in err: 22 | terminal.error(line) 23 | raise RuntimeError() 24 | 25 | try: 26 | return json.loads(b''.join(out).decode('utf-8')) 27 | except ValueError: 28 | return {} 29 | 30 | 31 | def get_cluster_nodes(conn): 32 | """ 33 | Ask a monitor (with a pre-made connection) about all the nodes in 34 | a cluster. This will be able to get us all known MONs and OSDs. 
35 | 36 | It returns a dictionary with a mapping that looks like:: 37 | 38 | { 39 | 'mons': [ 40 | { 41 | 'host': 'node1', 42 | 'public_ip': '192.168.1.100', 43 | }, 44 | ], 45 | 'osds': [ 46 | { 47 | 'host': 'node2', 48 | 'public_ip': '192.168.1.101', 49 | }, 50 | { 51 | 'host': 'node3', 52 | 'public_ip': '192.168.1.102', 53 | }, 54 | ] 55 | } 56 | 57 | """ 58 | report = get_mon_report(conn) 59 | nodes = {'mons': [], 'osds': []} 60 | try: 61 | # XXX Is this really needed? in what case we wouldn't have a monmap 62 | # with mons? 63 | mons = report['monmap']['mons'] 64 | except KeyError: 65 | raise SystemExit(report) 66 | for i in mons: 67 | nodes['mons'].append({ 68 | 'host': i['name'], 69 | 'public_ip': _extract_ip_address(i['public_addr']) 70 | }) 71 | 72 | osds = report['osd_metadata'] 73 | for i in osds: 74 | nodes['osds'].append({ 75 | 'host': i['hostname'], 76 | 'public_ip': _extract_ip_address(i['front_addr']) 77 | }) 78 | 79 | return nodes 80 | 81 | 82 | # XXX does not support IPV6 83 | 84 | def _extract_ip_address(string): 85 | """ 86 | Addresses from Ceph reports can come up with subnets and ports using ':' 87 | and '/' to identify them properly. Parse those types of strings to extract 88 | just the IP. 
89 | """ 90 | port_removed = string.split(':')[0] 91 | return port_removed.split('/')[0] 92 | -------------------------------------------------------------------------------- /ceph_medic/checks/osds.py: -------------------------------------------------------------------------------- 1 | from ceph_medic import metadata 2 | from ceph_medic.util import configuration 3 | 4 | 5 | # 6 | # Utilities 7 | # 8 | 9 | def get_osd_ceph_fsids(data): 10 | fsids = [] 11 | for file_path in data['paths']['/var/lib/ceph']['files'].keys(): 12 | if "ceph_fsid" in file_path: 13 | fsids.append(data['paths']['/var/lib/ceph']['files'][file_path]['contents'].strip()) 14 | return set(fsids) 15 | 16 | 17 | # XXX move out to a central utility module for other checks 18 | def get_ceph_conf(data): 19 | path = '/etc/ceph/%s.conf' % metadata['cluster_name'] 20 | try: 21 | conf_file = data['paths']['/etc/ceph']['files'][path] 22 | except KeyError: 23 | return None 24 | return configuration.load_string(conf_file['contents']) 25 | 26 | 27 | def check_osd_ceph_fsid(host, data): 28 | code = 'WOSD1' 29 | msg = "Multiple ceph_fsid values found: %s" 30 | 31 | current_fsids = get_osd_ceph_fsids(data) 32 | 33 | if len(current_fsids) > 1: 34 | return code, msg % ", ".join(current_fsids) 35 | 36 | 37 | def check_min_pool_size(host, data): 38 | code = 'WOSD2' 39 | msg = 'osd default pool min_size is set to 1, can potentially lose data' 40 | conf = get_ceph_conf(data) 41 | if not conf: # no ceph.conf found! 42 | return 43 | size = conf.get_safe('global', 'osd_pool_default_min_size', '0') 44 | if int(size) == 1: 45 | return code, msg 46 | 47 | 48 | def check_min_osd_nodes(host, data): 49 | code = 'WOSD3' 50 | msg = 'OSD nodes might not be enough for a healthy cluster (%s needed, %s found)' 51 | conf = get_ceph_conf(data) 52 | if not conf: # no ceph.conf found! 
def check_reasonable_ratios(host, data):
    """
    WOSD4: warn when any of the OSD map fullness ratios has been modified
    away from its well-known default value.

    :param host: hostname being checked (unused, kept for the check API)
    :param data: collected facts for the host; the ratios are read from
        ``data['ceph']['osd']['dump']`` when present
    :returns: ``(code, message)`` tuple naming the diverging ratios, or
        ``None`` when every present ratio matches its default
    """
    code = 'WOSD4'
    msg = 'Ratios have been modified to unreasonable values: %s'
    unreasonable_ratios = []
    reasonable_ratios = {
        "backfillfull_ratio": 0.9,
        "nearfull_ratio": 0.85,
        "full_ratio": 0.95
    }

    dump = data['ceph']['osd'].get('dump', {})
    for name, default in reasonable_ratios.items():
        ratio = dump.get(name)
        # BUG FIX: use `is None` instead of truthiness so a ratio of 0.0
        # (a clearly modified value) is flagged rather than skipped, and
        # compare against the unpacked default instead of re-indexing
        if ratio is None:
            continue
        if ratio != default:
            unreasonable_ratios.append(name)
    if unreasonable_ratios:
        msg = msg % ', '.join(sorted(unreasonable_ratios))
        return code, msg
18 | 19 | Follow these steps to install a CentOS 7 repo from download.ceph.com: 20 | 21 | - Install the latest RPM repo from download.ceph.com:: 22 | 23 | wget http://download.ceph.com/ceph-medic/latest/rpm/el7/ceph-medic.repo -O /etc/yum.repos.d/ceph-medic.repo 24 | 25 | - Install ``epel-release``:: 26 | 27 | 28 | yum install epel-release 29 | 30 | - Install the GPG key for ``ceph-medic``:: 31 | 32 | wget https://download.ceph.com/keys/release.asc 33 | rpm --import release.asc 34 | 35 | - Install ``ceph-medic``:: 36 | 37 | yum install ceph-medic 38 | 39 | - Verify your install:: 40 | 41 | ceph-medic --help 42 | 43 | Shaman Repos 44 | ------------ 45 | 46 | Every branch pushed to ceph-medic.git gets a RPM repo created and stored at 47 | shaman.ceph.com. Currently, only RPM repos built for CentOS 7 are supported. 48 | 49 | Browse https://shaman.ceph.com/repos/ceph-medic to find the available repos. 50 | 51 | .. Note:: 52 | Shaman repos are available for 2 weeks before they are automatically deleted. 53 | However, there should always be a repo available for the master branch of ``ceph-medic``. 54 | 55 | ``ceph-medic`` has dependencies on packages found in EPEL, so EPEL will need to be enabled. 
def as_list(string):
    """
    Split a comma-separated string into a list of trimmed items.

    Returns an empty list for ``None``/empty input, and discards empty
    entries produced by stray, leading, trailing, or doubled commas
    (e.g. ``'a,,b'`` or ``',a,'`` both become real items only).

    :param string: comma-separated codes (e.g. ``'ECOM1, WOSD2'``) or None
    :returns: list of stripped, non-empty strings
    """
    if not string:
        return []
    # split on commas, trim whitespace, and drop empty entries so that
    # doubled or dangling commas never produce '' items
    return [x.strip() for x in string.split(',') if x.strip()]
29 | 30 | 31 | Loaded Config Path: {config_path} 32 | 33 | Configured Nodes: 34 | {configured_nodes} 35 | """ 36 | 37 | def __init__(self, argv=None, parse=True): 38 | self.argv = argv or sys.argv 39 | 40 | @property 41 | def subcommand_args(self): 42 | # find where `check` is 43 | index = self.argv.index('check') 44 | # slice the args 45 | return self.argv[index:] 46 | 47 | def _help(self): 48 | node_section = [] 49 | for daemon, node in ceph_medic.config.nodes.items(): 50 | header = "\n* %s:\n" % daemon 51 | body = '\n'.join([" %s" % n for n in ceph_medic.config.nodes[daemon]]) 52 | node_section.append(header+body+'\n') 53 | return self.long_help.format( 54 | configured_nodes=''.join(node_section), 55 | config_path=ceph_medic.config.config_path 56 | ) 57 | 58 | def main(self): 59 | options = ['--ignore'] 60 | config_ignores = ceph_medic.config.file.get_list('check', '--ignore') 61 | parser = Transport( 62 | self.argv, options=options, 63 | check_version=False 64 | ) 65 | parser.catch_help = self._help() 66 | parser.parse_args() 67 | ignored_codes = as_list(parser.get('--ignore', '')) 68 | # fallback to the configuration if nothing is defined in the CLI 69 | if not ignored_codes: 70 | ignored_codes = config_ignores 71 | 72 | if len(self.argv) < 1: 73 | return parser.print_help() 74 | 75 | # populate the nodes metadata with the configured nodes 76 | for daemon in ceph_medic.config.nodes.keys(): 77 | ceph_medic.metadata['nodes'][daemon] = [] 78 | for daemon, nodes in ceph_medic.config.nodes.items(): 79 | for node in nodes: 80 | node_metadata = {'host': node['host']} 81 | if 'container' in node: 82 | node_metadata['container'] = node['container'] 83 | ceph_medic.metadata['nodes'][daemon].append(node_metadata) 84 | 85 | collector.collect() 86 | test = runner.Runner() 87 | test.ignore = ignored_codes 88 | results = test.run() 89 | runner.report(results) 90 | #XXX might want to make this configurable to not bark on warnings for 91 | # example, setting forcefully for 
now, but the results object doesn't 92 | # make a distinction between error and warning (!) 93 | if results.errors or results.warnings: 94 | sys.exit(1) 95 | -------------------------------------------------------------------------------- /docs/source/facts.rst: -------------------------------------------------------------------------------- 1 | Cluster node facts 2 | ================== 3 | Fact collection happens per node and creates a mapping of hosts and data 4 | gathered. Each daemon 'type' is the primary key:: 5 | 6 | ... 7 | 'osd': { 8 | 'node1': {...}, 9 | 'node2': {...}, 10 | } 11 | 'mon': { 12 | 'node3': {...}, 13 | } 14 | 15 | 16 | There are other top-level keys that make it easier to deal with fact metadata, for example a full list of all hosts discovered:: 17 | 18 | 'hosts': ['node1', 'node2', 'node3'], 19 | 'osds': ['node1', 'node2'], 20 | 'mons': ['node3'] 21 | 22 | 23 | Each host has distinct metadata that gets collected. If any errors are 24 | detected, the ``exception`` key is set populated with all information pertaining 25 | to the error generated when trying to execute the call. For example, a failed call to ``stat`` on a path might be:: 26 | 27 | 'osd': { 28 | 'node1': { 29 | 'paths': { 30 | '/var/lib/osd': { 31 | 'exception': { 32 | 'traceback': "Traceback (most recent call last):\n File "remote.py", line 3, in \n os.stat('/var/lib/osd')\n OSError: [Errno 2] No such file or directory: '/var/lib/osd'\n", 33 | 'name': 'OSError', 34 | 'repr': "[Errno 2] No such file or directory: '/root'" 35 | 'attributes': { 36 | args : "(2, 'No such file or directory')", 37 | errno : 2, 38 | filename : '/var/lib/ceph' , 39 | message : '', 40 | strerror : 'No such file or directory' 41 | } 42 | } 43 | } 44 | } 45 | } 46 | 47 | Note that objects will not get pickled, so data structures and objects will be 48 | sent back as plain text. 
def generate_inventory(inventory, to_stdout=False, tmp_dir=None):
    """
    Generates a host file to use with an ansible-playbook call.

    The first argument is a dictionary mapping that contains the group name as
    the key and a list of hostnames as values

    For example:

        {'mons': ['mon.host'], 'osds': ['osd1.host', 'osd1.host']}

    :param to_stdout: when True, print the rendered inventory and write nothing
    :param tmp_dir: optional directory in which ``hosts_file`` is created;
                    defaults to the current working directory. Previously this
                    parameter was accepted but silently ignored.
    """
    import os
    result = []
    for group_name, hosts in inventory.items():
        result.append("[{0}]".format(group_name))
        # a bare string is treated as a single-host group
        if not isinstance(hosts, list):
            hosts = [hosts]
        result.extend(hosts)
    result_str = "\n".join(result) + "\n"
    if to_stdout:
        print(result_str)
        return
    # honor tmp_dir so callers can direct where the file lands
    path = os.path.join(tmp_dir, 'hosts_file') if tmp_dir else 'hosts_file'
    with open(path, 'w') as hosts_file:
        hosts_file.write(result_str)
61 | 62 | Usage: 63 | 64 | ceph-medic generate [/path/to/ceph.conf] 65 | ceph-medic generate [MONITOR HOST] 66 | 67 | Loaded Config Path: {config_path} 68 | 69 | """ 70 | 71 | def __init__(self, argv=None, parse=True): 72 | self.argv = argv or sys.argv 73 | 74 | def _help(self): 75 | skip_internal = ['__file__', 'config_path', 'verbosity'] 76 | node_section = [] 77 | for daemon, node in ceph_medic.config['nodes'].items(): 78 | if daemon in skip_internal or not node: 79 | continue 80 | header = "\n* %s:\n" % daemon 81 | body = '\n'.join([" %s" % n for n in ceph_medic.config['nodes'][daemon].keys()]) 82 | node_section.append(header+body+'\n') 83 | return self.long_help.format( 84 | config_path=ceph_medic.config['config_path'] 85 | ) 86 | 87 | def main(self): 88 | options = ['--stdout'] 89 | parser = Transport( 90 | self.argv, options=options, 91 | check_version=False 92 | ) 93 | parser.catch_help = self._help() 94 | 95 | parser.parse_args() 96 | 97 | if len(self.argv) == 1: 98 | raise SystemExit("A monitor hostname or a ceph.conf file is required as an argument") 99 | 100 | node = self.argv[-1] 101 | inventory = {} 102 | 103 | with get_connection(node) as conn: 104 | report = get_mon_report(conn) 105 | try: 106 | mons = report['monmap']['mons'] 107 | except KeyError: 108 | raise SystemExit(report) 109 | inventory['mons'] = [i['name'] for i in mons] 110 | osds = report['osd_metadata'] 111 | inventory['osds'] = [i['hostname'] for i in osds] 112 | 113 | if not inventory: 114 | raise SystemExit('no hosts where found from remote monitor node: %s' % node) 115 | 116 | generate_inventory(inventory, to_stdout=parser.get('--stdout')) 117 | conn.exit() 118 | return 119 | -------------------------------------------------------------------------------- /ceph_medic/tests/util/test_configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from textwrap import dedent 4 | from ceph_medic.util import 
def make_hosts_file(filename, contents=None):
    """Create a hosts file at ``filename``; falls back to a minimal mons/osds inventory."""
    default = "[mons]\nmon0\n[osds]\nosd0\n"
    with open(filename, 'w') as f:
        f.write(contents or default)
class TestLoadString(object):
    """Tests for ``configuration.load_string``, which parses INI-style text."""

    def test_loads_valid_ceph_key(self):
        # A plain ceph.conf-style section/key should round-trip via get_safe.
        contents = dedent("""
        [global]
        cluster = ceph
        """)
        conf = configuration.load_string(contents)
        assert conf.get_safe('global', 'cluster') == 'ceph'

    def test_loads_key_with_spaces_converted(self):
        # Keys containing spaces are retrievable with underscores, as the
        # assertion below demonstrates.
        contents = dedent("""
        [global]
        some key here = ceph
        """)
        conf = configuration.load_string(contents)
        assert conf.get_safe('global', 'some_key_here') == 'ceph'
def ceph_status(conn):
    """
    Collect cluster status with ``ceph -s --format json`` over ``conn``.

    Returns the parsed status as a dict, an empty dict when the command
    exits non-zero or emits invalid JSON, and ``None`` when the command
    could not be executed at all (``RuntimeError`` from the remote check).
    """
    try:  # collects information using ceph -s
        stdout, stderr, exit_code = check(conn, ['ceph', '-s', '--format', 'json'])
        if exit_code != 0:
            # mirror ceph_osd_dump: surface the remote stderr instead of
            # attempting to parse output from a failed command
            conn.logger.error('could not get status from ceph')
            for line in stderr:
                conn.logger.error(line)
            return {}
        result = dict()
        try:
            result = json.loads(''.join(stdout))
        except ValueError:
            conn.logger.exception("failed to fetch ceph status, invalid json: %s" % ''.join(stdout))
        return result
    except RuntimeError:
        conn.logger.exception('failed to fetch ceph status')
def make_test_file(filename, contents=None):
    """Write ``contents`` (default: ``'foo'``) to ``filename``."""
    with open(filename, 'w') as f:
        f.write(contents or "foo")


def make_test_tree(path, contents=None, tree=None):
    """Build a small fixture tree under ``path``: file1.txt and dir1/file2.txt."""
    make_test_file(os.path.join(path, "file1.txt"))
    nested_dir = os.path.join(path, "dir1")
    os.mkdir(nested_dir)
    make_test_file(os.path.join(nested_dir, "file2.txt"))
class AttributeLandMine(object):
    """Test helper whose only attribute raises on access."""

    @property
    def explode(self):
        # Any read of .explode blows up, so capture_exception can be
        # exercised against objects with hostile attributes.
        raise ValueError('Raising on attribute access')
class ReleaseCommand(Command):
    """ Tag and push a new release. """

    # setuptools option spec: ``--sign`` / ``-s`` enables GPG signing
    user_options = [('sign', 's', 'GPG-sign the Git tag and release files')]

    def initialize_options(self):
        # signing is opt-in
        self.sign = False

    def finalize_options(self):
        pass

    def run(self):
        # Tag the current tree with v<version>, push the tag, then push master.
        # ``version`` is the module-level version string defined earlier in setup.py.
        # Create Git tag
        tag_name = 'v%s' % version
        cmd = ['git', 'tag', '-a', tag_name, '-m', 'version %s' % version]
        if self.sign:
            cmd.append('-s')
        print(' '.join(cmd))
        subprocess.check_call(cmd)

        # Push Git tag to origin remote
        cmd = ['git', 'push', 'origin', tag_name]
        print(' '.join(cmd))
        subprocess.check_call(cmd)

        # Push package to pypi
        cmd = ['python', 'setup.py', 'sdist', 'upload']
        if self.sign:
            cmd.append('--sign')
        print(' '.join(cmd))
        # NOTE(review): the upload call is commented out — only the command is
        # printed; confirm whether releases are uploaded by another mechanism.
        #subprocess.check_call(cmd)

        # Push master to the remote
        cmd = ['git', 'push', 'origin', 'master']
        print(' '.join(cmd))
        subprocess.check_call(cmd)
class TestOSDS(object):
    """Tests for ceph_medic.checks.osds fsid and pool-size checks."""

    def test_fails_check_ceph_fsid(self):
        # Two OSDs on the same host reporting different cluster fsids
        # should trigger the WOSD1 warning code.
        data = {'paths': {'/var/lib/ceph': {'files': {
            '/var/lib/ceph/osd/ceph-0/ceph_fsid': {'contents': "fsid1"},
            '/var/lib/ceph/osd/ceph-1/ceph_fsid': {'contents': "fsid2"},
        }}}}
        result = osds.check_osd_ceph_fsid(None, data)
        assert "WOSD1" in result

    def test_min_pool_size_fails(self, data):
        # ``data`` is a node-metadata factory fixture — presumably defined
        # in conftest.py; verify against the fixtures if this moves.
        metadata['cluster_name'] = 'ceph'
        contents = dedent("""
        [global]
        cluster = foo
        osd_pool_default_min_size = 1
        """)
        osd_data = data()
        osd_data['paths']['/etc/ceph']['files']['/etc/ceph/ceph.conf'] = {'contents': contents}
        code, error = osds.check_min_pool_size(None, osd_data)
        assert error == 'osd default pool min_size is set to 1, can potentially lose data'

    def test_min_pool_size_is_correct(self, data):
        # min_size of 2 is acceptable, so the check reports nothing (None).
        metadata['cluster_name'] = 'ceph'
        contents = dedent("""
        [global]
        cluster = foo
        osd_pool_default_min_size = 2
        """)
        osd_data = data()
        osd_data['paths']['/etc/ceph']['files']['/etc/ceph/ceph.conf'] = {'contents': contents}
        result = osds.check_min_pool_size(None, osd_data)
        assert result is None
    def test_min_osd_nodes_is_not_met(self, data):
        """A single OSD host is below the recommended minimum and yields WOSD3."""
        # register exactly one OSD host in the shared metadata mapping
        metadata['osds'] = {'osd1': []}
        metadata['cluster_name'] = 'ceph'
        osd_data = data()
        contents = dedent("""
        [global]
        cluster = foo
        osd_pool_default_min_size = 2
        """)
        osd_data['paths']['/etc/ceph']['files']['/etc/ceph/ceph.conf'] = {'contents': contents}
        code, error = osds.check_min_osd_nodes(None, osd_data)
        assert code == 'WOSD3'
        # the message reports how many nodes are needed vs. how many exist
        assert '6 needed, 1 found' in error
"full_ratio": 0.95 103 | } 104 | code, msg = osds.check_reasonable_ratios('node1', self.data) 105 | assert msg.endswith('backfillfull_ratio') 106 | 107 | def test_nearfull_is_messed_up(self): 108 | self.data['ceph']['osd']['dump'] = { 109 | "backfillfull_ratio": 0.9, 110 | "nearfull_ratio": 0.88, 111 | "full_ratio": 0.95 112 | } 113 | code, msg = osds.check_reasonable_ratios('node1', self.data) 114 | assert msg.endswith('nearfull_ratio') 115 | 116 | def test_full_is_messed_up(self): 117 | self.data['ceph']['osd']['dump'] = { 118 | "backfillfull_ratio": 0.9, 119 | "nearfull_ratio": 0.89, 120 | "full_ratio": 0.95 121 | } 122 | code, msg = osds.check_reasonable_ratios('node1', self.data) 123 | assert msg.endswith('full_ratio') 124 | -------------------------------------------------------------------------------- /ceph_medic/connection.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import socket 3 | import remoto 4 | import ceph_medic 5 | from execnet.gateway_bootstrap import HostNotFound 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def get_connection(hostname, username=None, threads=5, use_sudo=None, detect_sudo=True, **kw): 11 | """ 12 | A very simple helper, meant to return a connection 13 | that will know about the need to use sudo. 
14 | """ 15 | if kw.get('logger') is False: # explicitly disable remote logging 16 | remote_logger = None 17 | else: 18 | remote_logger = logging.getLogger(hostname) 19 | 20 | if username: 21 | hostname = "%s@%s" % (username, hostname) 22 | 23 | if ceph_medic.config.ssh_config: 24 | hostname = "-F %s %s" % (ceph_medic.config.ssh_config, hostname) 25 | try: 26 | deployment_type = kw.get( 27 | 'deployment_type', 28 | ceph_medic.config.file.get_safe( 29 | 'global', 'deployment_type', 'baremetal') 30 | ) 31 | conn_obj = remoto.connection.get(deployment_type) 32 | if deployment_type in ['k8s', 'kubernetes', 'openshift', 'oc']: 33 | conn = container_platform_conn(hostname, conn_obj, deployment_type) 34 | # check if conn is ok 35 | stdout, stderr, code = remoto.process.check(conn, ['true']) 36 | if code: 37 | raise HostNotFound( 38 | 'Remote connection failed while testing connection:\n %s' % '\n'.join(stderr)) 39 | elif deployment_type in ['docker', 'podman']: 40 | if kw.get('logger', True): 41 | remote_logger = logging.getLogger(kw['container']) 42 | conn = conn_obj( 43 | hostname, 44 | container_name=kw['container'], 45 | logger=remote_logger, 46 | detect_sudo=detect_sudo, 47 | ) 48 | elif deployment_type in ['ssh', 'baremetal']: 49 | conn = conn_obj( 50 | hostname, 51 | logger=remote_logger, 52 | threads=threads, 53 | detect_sudo=detect_sudo, 54 | ) 55 | else: 56 | raise RuntimeError( 57 | 'Invalid deployment_type: %s' % deployment_type) 58 | # Set a timeout value in seconds to disconnect and move on 59 | # if no data is sent back. 
60 | conn.global_timeout = 300 61 | # XXX put this somewhere else 62 | if not ceph_medic.config.cluster_name: 63 | cluster_conf_files, stderr, exit_code = remoto.process.check(conn, ['ls', '/etc/ceph/']) 64 | cluster_name = 'ceph' 65 | if 'ceph.conf' not in cluster_conf_files: 66 | logger.warning('/etc/ceph/ceph.conf was not found, will try to infer the cluster name') 67 | for i in cluster_conf_files: 68 | if i.endswith('conf'): 69 | cluster_name = i.split('.conf')[0] 70 | logger.warning('inferred %s as the cluster name', cluster_name) 71 | ceph_medic.metadata['cluster_name'] = cluster_name 72 | else: 73 | ceph_medic.metadata['cluster_name'] = ceph_medic.config.cluster_name 74 | return conn 75 | except Exception as error: 76 | msg = "connecting to host: %s " % hostname 77 | errors = "resulted in errors: %s %s" % (error.__class__.__name__, error) 78 | logger.error(msg) 79 | logger.error(errors) 80 | raise error 81 | 82 | 83 | def container_platform_conn(hostname, conn_obj, deployment_type): 84 | """ 85 | This helper function is only valid for container platform connections like 86 | OpenShift or Kubernetes. Fetches the configuration needed to properly 87 | configure the connection object, and then returns it. 
def as_bytes(string):
    """
    Ensure that whatever type of string is incoming, it is returned as bytes,
    encoding to utf-8 otherwise
    """
    if not isinstance(string, bytes):
        string = string.encode('utf-8', errors='ignore')
    return string
def get_monitor_dirs(dirs):
    """
    Find all the /var/lib/ceph/mon/* directories. The metadata reports paths
    in a flat list, so nested entries like ``.../mon/ceph-a/store`` must be
    collapsed down to their top-level monitor dir (``ceph-a``). Returns the
    set of unique monitor dir names.
    """
    prefix = '/var/lib/ceph/mon/'
    # keep only paths under the mon prefix, then take the first path
    # component after the prefix so nested entries collapse to their dir
    return set(
        path.split(prefix)[-1].split('/')[0]
        for path in dirs
        if path.startswith(prefix)
    )
def check_mon_secret(host, data):
    """
    EMON1: flag monitors whose ``mon.`` secret key differs from this host's.

    Returns a ``(code, message)`` tuple when at least one other monitor
    reports a different secret, or ``None`` when every secret matches or
    the secret could not be read from the current host's keyring.
    """
    code = 'EMON1'
    msg = 'secret key "%s" is different than host(s): %s'
    mismatched_hosts = []

    current_secret = get_secret(data)
    if not current_secret:
        # there is no keyring file for the current host, so we can't compare
        return

    # fixed: the loop variable used to shadow the ``host`` parameter; use
    # distinct names so the current host name stays available
    for other_host, other_data in metadata['mons'].items():
        other_secret = get_secret(other_data)
        if not other_secret:
            # cannot compare with another host that may not have the secret
            continue
        if other_secret != current_secret:
            mismatched_hosts.append(other_host)

    if mismatched_hosts:
        return code, msg % (current_secret, ','.join(mismatched_hosts))
def failed_check(raise_=True):
    """
    Simulate a failing remote check: raise ``RuntimeError`` by default, or
    return a non-zero result dictionary when ``raise_`` is falsy.
    """
    if not raise_:
        return {'stdout': '', 'stderr': '', 'code': 1}
    raise RuntimeError('command failed')
hosts.container_platform('kubernetes') 34 | command = check.calls[0]['args'][1] 35 | assert command == [ 36 | 'kubectl', '--request-timeout=5', 'get', '-n', 37 | 'rook-ceph', 'pods', '-o', 'json' 38 | ] 39 | 40 | def test_garbage_stdout(self, stub_check, capsys): 41 | stub_check((['could not contact platform'], [], 1)) 42 | with pytest.raises(SystemExit): 43 | hosts.container_platform('kubernetes') 44 | stdout, stderr = capsys.readouterr() 45 | assert 'Unable to load JSON from stdout' in stdout 46 | assert 'could not contact platform' in stdout 47 | 48 | def test_garbage_stderr(self, stub_check, capsys): 49 | stub_check(([], ['could not contact platform'], 1)) 50 | with pytest.raises(SystemExit): 51 | hosts.container_platform('kubernetes') 52 | stdout, stderr = capsys.readouterr() 53 | assert 'Unable to load JSON from stdout' in stdout 54 | assert 'could not contact platform' in stdout 55 | 56 | def test_kubectl_with_context(self, stub_check): 57 | contents = dedent(""" 58 | [kubernetes] 59 | context = 87 60 | """) 61 | conf = configuration.load_string(contents) 62 | ceph_medic.config.file = conf 63 | check = stub_check((['{"items": {}}'], [], 1)) 64 | hosts.container_platform('kubernetes') 65 | command = check.calls[0]['args'][1] 66 | assert command == [ 67 | 'kubectl', '--context', '87', '--request-timeout=5', 'get', '-n', 68 | 'rook-ceph', 'pods', '-o', 'json' 69 | ] 70 | 71 | def test_oc_with_context(self, stub_check): 72 | contents = dedent(""" 73 | [openshift] 74 | context = 87 75 | """) 76 | conf = configuration.load_string(contents) 77 | ceph_medic.config.file = conf 78 | check = stub_check((['{"items": {}}'], [], 1)) 79 | hosts.container_platform() 80 | command = check.calls[0]['args'][1] 81 | assert command == [ 82 | 'oc', '--context', '87', '--request-timeout=5', 'get', '-n', 83 | 'rook-ceph', 'pods', '-o', 'json' 84 | ] 85 | 86 | 87 | class TestBasicContainers(object): 88 | binaries = ['docker', 'podman'] 89 | 90 | @pytest.mark.parametrize('binary', 
binaries) 91 | def test_executable_fails( 92 | self, binary, monkeypatch, make_nodes, capsys): 93 | monkeypatch.setattr(hosts.config, 'nodes', make_nodes(mgrs=['mgr0'])) 94 | monkeypatch.setattr( 95 | hosts.ceph_medic.connection, 'get_connection', 96 | lambda *a, **k: None) 97 | monkeypatch.setattr( 98 | hosts.process, 'check', lambda *a: failed_check(False)) 99 | hosts.basic_containers(binary) 100 | stdout, stderr = capsys.readouterr() 101 | assert 'Unable to list containers on host mgr0' in stdout 102 | 103 | @pytest.mark.parametrize('binary', binaries) 104 | def test_inspection( 105 | self, binary, monkeypatch, make_nodes, stub_check, capsys): 106 | monkeypatch.setattr(ceph_medic.config, 'cluster_name', 'ceph') 107 | monkeypatch.setattr(hosts.config, 'nodes', make_nodes(mgrs=['mgr0'])) 108 | monkeypatch.setattr( 109 | hosts.ceph_medic.connection, 'get_connection', 110 | lambda *a, **k: None) 111 | fake_list = '\n'.join(['mgr0-container']) 112 | fake_mgr = json.dumps([{ 113 | 'Name': 'mgr0-container', 114 | 'Config': { 115 | 'Env': [ 116 | 'CLUSTER=ceph', 117 | 'CEPH_DAEMON=MGR', 118 | ] 119 | } 120 | }]) 121 | stub_check([ 122 | ([fake_mgr], [''], 0), 123 | ([fake_list], [''], 0), 124 | ]) 125 | result = hosts.basic_containers(binary) 126 | assert result['mgrs'][0]['host'] == 'mgr0' 127 | assert result['mgrs'][0]['container'] == 'mgr0-container' 128 | -------------------------------------------------------------------------------- /ceph_medic/tests/checks/test_mons.py: -------------------------------------------------------------------------------- 1 | from ceph_medic import metadata 2 | from ceph_medic.checks import mons 3 | 4 | 5 | class TestGetSecret(object): 6 | 7 | def setup(self): 8 | self.data = { 9 | 'paths': { 10 | '/var/lib/ceph': { 11 | 'files': { 12 | '/var/lib/ceph/mon/ceph-mon-0/keyring': { 13 | 'contents': '', 14 | } 15 | } 16 | } 17 | } 18 | } 19 | 20 | def set_contents(self, string, file_path=None): 21 | file_path = file_path or 
class TestGetMonitorDirs(object):
    """Behavior of mons.get_monitor_dirs over flat directory listings."""

    def test_get_monitor_dirs(self):
        dirs = ['/var/lib/ceph/mon/ceph-mon-1', '/var/lib/ceph/something']
        assert mons.get_monitor_dirs(dirs) == {'ceph-mon-1'}

    def test_cannot_get_monitor_dirs(self):
        # osd paths must never be reported as monitor dirs
        dirs = ['/var/lib/ceph/osd/ceph-osd-1', '/var/lib/ceph/something']
        assert mons.get_monitor_dirs(dirs) == set()

    def test_get_monitor_dirs_multiple(self):
        dirs = [
            '/var/lib/ceph/mon/ceph-mon-1',
            '/var/lib/ceph/mon/ceph-mon-3',
            '/var/lib/ceph/mon/ceph-mon-2',
            '/var/lib/ceph/something',
        ]
        assert mons.get_monitor_dirs(dirs) == {
            'ceph-mon-1', 'ceph-mon-2', 'ceph-mon-3'}

    def test_get_monitor_dirs_nested_multiple(self):
        # nested paths collapse onto their top-level monitor dir
        dirs = [
            '/var/lib/ceph/mon/ceph-mon-1',
            '/var/lib/ceph/mon/ceph-mon-1/nested/dir/',
            '/var/lib/ceph/mon/ceph-mon-1/other/nested',
            '/var/lib/ceph/mon/ceph-mon-2',
            '/var/lib/ceph/something',
        ]
        assert mons.get_monitor_dirs(dirs) == {'ceph-mon-1', 'ceph-mon-2'}
class TestMonCountIsOdd(object):
    """WMON4 fires only when the monitor count is even."""

    def test_count_is_odd(self, data):
        metadata['mons'] = {'mon%s' % n: [] for n in range(3)}
        metadata['cluster_name'] = 'ceph'
        assert mons.check_mon_count_is_odd(None, data()) is None

    def test_recommended_count_is_unmet(self, data):
        metadata['mons'] = {'mon%s' % n: [] for n in range(2)}
        metadata['cluster_name'] = 'ceph'
        code, message = mons.check_mon_count_is_odd(None, data())
        assert code == 'WMON4'
        assert message == 'Number of MONs is not an odd number: 2'
def _platform_options(platform):
    """
    Pull the ``namespace`` and ``context`` options for ``platform`` from the
    loaded configuration file, falling back to the defaults ('rook-ceph'
    namespace, no context) when the configuration cannot be read.
    """
    options = {'namespace': 'rook-ceph', 'context': None}
    try:
        options = {
            'namespace': config.file.get_safe(platform, 'namespace', 'rook-ceph'),
            'context': config.file.get_safe(platform, 'context', None),
        }
    except RuntimeError:
        # no usable configuration file was loaded; keep the defaults
        pass
    return options
def container_platform(platform='openshift'):
    """
    Connect to a container platform (kubernetes or openshift), retrieve all
    the available pods that match the namespace (defaults to 'rook-ceph'),
    and return an inventory dictionary grouped by daemon type, regardless
    of pod state.

    Raises ``SystemExit`` when the platform command's output cannot be
    parsed as JSON.
    """
    local_conn = connection.get('local')()
    options = _platform_options(platform)
    context = options.get('context')
    namespace = options.get('namespace')
    # openshift ships its own client binary; everything else uses kubectl
    executable = 'oc' if platform == 'openshift' else 'kubectl'

    if context:
        cmd = [executable, '--context', context]
    else:
        cmd = [executable]

    cmd.extend(['--request-timeout=5', 'get', '-n', namespace, 'pods', '-o', 'json'])

    try:
        out, err, code = process.check(local_conn, cmd)
    except RuntimeError:
        # fixed: bind both streams as *lists of lines* — previously ``out``
        # was the string "{}" (joined only by accident of char iteration)
        # and ``err`` was left undefined, so the JSON-failure reporting
        # below would raise NameError on this path
        out, err, code = ['{}'], [], 0
        terminal.error('Unable to retrieve the pods using command: %s' % ' '.join(cmd))
    else:
        if code:
            # surface whatever the command printed; the JSON parse below
            # decides whether we can actually continue
            for line in out + err:
                terminal.error(line)

    try:
        pods = json.loads(''.join(out))
    except Exception:
        # Python3 raises JSONDecodeError (a ValueError subclass) which
        # doesn't exist in Python2; Python2 raises plain ValueError
        stdout = ''.join(out)
        stderr = ''.join(err)
        logger.exception('Invalid JSON from stdout')
        terminal.error('Unable to load JSON from stdout')
        if stdout:
            logger.error('stdout: %s', stdout)
            terminal.error('stdout: %s' % stdout)
        if stderr:
            logger.error('stderr: %s', stderr)
            terminal.error('stderr: %s' % stderr)
        raise SystemExit(1)

    base_inventory = {
        'rgws': [], 'mgrs': [], 'mdss': [], 'clients': [], 'osds': [], 'mons': []
    }
    # rook 'app' label -> inventory group
    label_map = {
        'rook-ceph-mgr': 'mgrs',
        'rook-ceph-mon': 'mons',
        'rook-ceph-osd': 'osds',
        'rook-ceph-mds': 'mdss',
        'rook-ceph-rgw': 'rgws',
        'rook-ceph-client': 'clients',
    }

    for item in pods.get('items', {}):
        label_name = item['metadata'].get('labels', {}).get('app')
        if not label_name:
            continue
        if label_name in label_map:
            base_inventory[label_map[label_name]].append(
                {'host': item['metadata']['name'], 'group': None}
            )
    # drop groups that ended up with no pods at all
    for key, value in dict(base_inventory).items():
        if not value:
            base_inventory.pop(key)
    return base_inventory
def basic_containers(deployment_type):
    """
    Build a node inventory by listing and inspecting plain containers
    (``docker`` or ``podman``) on every configured bare-metal host.

    Containers are matched to ceph daemon roles through their CEPH_DAEMON
    environment variable, and filtered by the CLUSTER variable so only
    containers belonging to the configured cluster are reported.
    """
    base_inventory = {
        'rgws': [], 'mgrs': [], 'mdss': [], 'clients': [], 'osds': [],
        'mons': []
    }
    # CEPH_DAEMON value -> inventory group
    label_map = {
        'OSD': 'osds',
        'OSD_CEPH_VOLUME_ACTIVATE': 'osds',
        'MON': 'mons',
        'MGR': 'mgrs',
        'MDS': 'mdss',
        'RGW': 'rgws',
    }
    metal_hosts = set()
    for nodes in config.nodes.values():
        for node in nodes:
            metal_hosts.add(node['host'])
    for host in metal_hosts:
        logger.debug("listing containers for host %s", host)
        cmd = [deployment_type, 'container', 'ls', '--format',
               '"{{ .Names }}"']
        conn = ceph_medic.connection.get_connection(
            host, deployment_type='ssh')
        out, err, code = process.check(conn, cmd)
        if code:
            terminal.error("Unable to list containers on host %s" % host)
            continue
        # fixed: ``map`` returns a lazy, always-truthy object on Python 3,
        # which made the "no containers" warning below unreachable; build
        # a real list so the emptiness check works
        container_list = [i.strip('"') for i in out]
        if not container_list:
            terminal.warning("Host %s had no containers" % host)
            continue
        for container_name in container_list:
            cmd = [deployment_type, 'container', 'inspect', container_name]
            out, err, code = process.check(conn, cmd)
            if code:
                terminal.error(
                    "Unable to inspect container %s on host %s" %
                    (container_name, host)
                )
                continue
            detail = json.loads(''.join(out))[0]
            env = dict(
                [s.split('=', 1) for s in detail['Config']['Env']])
            if 'CEPH_DAEMON' not in env:
                continue
            if env.get('CLUSTER') != config.cluster_name:
                continue
            role = env['CEPH_DAEMON']
            if role not in label_map:
                continue
            base_inventory[label_map[role]].append(
                {'host': host, 'container': container_name, 'group': None}
            )
    return base_inventory
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # ceph-medic documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jun 27 14:32:23 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | import os 22 | import sys 23 | # sys.path.insert(0, os.path.abspath('.')) 24 | sys.path.append(os.path.abspath('_themes')) 25 | 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # 44 | # source_suffix = ['.rst', '.md'] 45 | source_suffix = '.rst' 46 | 47 | # The master toctree document. 48 | master_doc = 'contents' 49 | 50 | # General information about the project. 
51 | project = u'ceph-medic' 52 | copyright = u'2017, Andrew Schoen, Alfredo Deza' 53 | author = u'Andrew Schoen, Alfredo Deza' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | # The short X.Y version. 60 | version = u'0.0.1' 61 | # The full version, including alpha/beta/rc tags. 62 | release = u'0.0.1' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | # 67 | # This is also used if you do content translation via gettext catalogs. 68 | # Usually you set "language" from the command line for these cases. 69 | language = None 70 | 71 | # List of patterns, relative to source directory, that match files and 72 | # directories to ignore when looking for source files. 73 | # This patterns also effect to html_static_path and html_extra_path 74 | exclude_patterns = [] 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = 'sphinx' 78 | 79 | # If true, `todo` and `todoList` produce output, else they produce nothing. 80 | todo_include_todos = False 81 | 82 | 83 | # -- Options for HTML output ---------------------------------------------- 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | html_theme = 'ceph' 89 | # Add any paths that contain custom themes here, relative to this directory. 90 | html_theme_path = ['_themes'] 91 | html_show_sphinx = False 92 | html_sidebars = { 93 | '**': ['smarttoc.html', 'searchbox.html'], 94 | } 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 
99 | # 100 | # html_theme_options = {} 101 | 102 | # Add any paths that contain custom static files (such as style sheets) here, 103 | # relative to this directory. They are copied after the builtin static files, 104 | # so a file named "default.css" will overwrite the builtin "default.css". 105 | html_static_path = ['_static'] 106 | 107 | 108 | # -- Options for HTMLHelp output ------------------------------------------ 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'ceph-medicdoc' 112 | 113 | 114 | # -- Options for LaTeX output --------------------------------------------- 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | 121 | # The font size ('10pt', '11pt' or '12pt'). 122 | # 123 | # 'pointsize': '10pt', 124 | 125 | # Additional stuff for the LaTeX preamble. 126 | # 127 | # 'preamble': '', 128 | 129 | # Latex figure (float) alignment 130 | # 131 | # 'figure_align': 'htbp', 132 | } 133 | 134 | # Grouping the document tree into LaTeX files. List of tuples 135 | # (source start file, target name, title, 136 | # author, documentclass [howto, manual, or own class]). 137 | latex_documents = [ 138 | (master_doc, 'ceph-medic.tex', u'ceph-medic Documentation', 139 | u'Andrew Schoen, Alfredo Deza', 'manual'), 140 | ] 141 | 142 | 143 | # -- Options for manual page output --------------------------------------- 144 | 145 | # One entry per manual page. List of tuples 146 | # (source start file, name, description, authors, manual section). 147 | man_pages = [ 148 | (master_doc, 'ceph-medic', u'ceph-medic Documentation', 149 | [author], 1) 150 | ] 151 | 152 | 153 | # -- Options for Texinfo output ------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. 
class colorize(str):
    """
    Pretty simple to use::

        colorize.make('foo').bold
        colorize.make('foo').green
        colorize.make('foo').yellow
        colorize.make('foo').red
        colorize.make('foo').blue

    Otherwise you could go the long way (for example if you are
    testing this class)::

        string = colorize('foo')
        string._set_attributes()
        string.red

    """

    # NOTE: because ``str`` is immutable, the text itself is handled by
    # ``str.__new__``; __init__ only records terminal state used to decide
    # whether escape codes should be emitted at all.
    def __init__(self, string):
        self.stdout = sys.__stdout__
        self.appends = ''
        self.prepends = ''
        self.isatty = self.stdout.isatty()

    def _set_attributes(self):
        """
        Sets the attributes here because the str class does not
        allow to pass in anything other than a string to the constructor
        so we can't really mess with the other attributes.
        """
        # the color attributes (.red, .bold, ...) only exist after this runs,
        # which is why callers should go through ``colorize.make``
        for k, v in self.__colors__.items():
            setattr(self, k, self.make_color(v))

    def make_color(self, color):
        # colorizing a non-tty stream (or any stream on Windows) would leak
        # raw escape sequences, so return the plain string unchanged instead
        if not self.isatty or self.is_windows:
            return self
        # '\033[0m' resets the terminal back to the default color
        return color + self + '\033[0m' + self.appends

    @property
    def __colors__(self):
        # ANSI escape sequences for each supported attribute name
        return dict(
            blue = '\033[34m',
            green = '\033[92m',
            yellow = '\033[33m',
            red = '\033[91m',
            bold = '\033[1m',
            ends = '\033[0m'
        )

    @property
    def is_windows(self):
        # classic cmd.exe consoles do not interpret ANSI escapes
        if sys.platform == 'win32':
            return True
        return False

    @classmethod
    def make(cls, string):
        """
        A helper method to return itself and workaround the fact that
        the str object doesn't allow extra arguments passed in to the
        constructor
        """
        obj = cls(string)
        obj._set_attributes()
        return obj
class LogMessage(object):
    """
    A single terminal status line (``--> message``) whose arrow header is
    colored according to the message level, filtered against the
    configured verbosity.
    """

    def __init__(self, level_name, message, writer=None, config_level=None):
        self.level_name = level_name
        self.message = message
        self.writer = writer or sys.stdout
        self.config_level = config_level or self.get_config_level()

    def skip(self):
        # anything below the configured verbosity is silently dropped
        return self.level_int < self.config_level

    def header(self):
        colored = colorize.make(self.base_string)
        return getattr(colored, self.level_color)

    @property
    def base_string(self):
        # verbose configurations get the level name embedded in the arrow
        if self.config_level < 2:
            return "--> [%s]" % self.level_name
        return "-->"

    @property
    def level_int(self):
        # 'remote' output is treated like info-level messages
        if self.level_name == 'remote':
            return 2
        return _reverse_level_names.get(self.level_name, 4)

    @property
    def level_color(self):
        return _level_colors.get(self.level_name, 'info')

    def line(self):
        return "%s %s\n" % (self.header(), self.message.rstrip('\n'))

    def write(self):
        if self.skip():
            return
        self.writer.write(self.line())

    def get_config_level(self):
        # imported lazily to avoid a circular import at module load time
        import ceph_medic
        return _reverse_level_names.get(ceph_medic.config.verbosity, 4)
193 | return LogMessage('debug', message).write() 194 | 195 | 196 | def info(message): 197 | return LogMessage('info', message).write() 198 | 199 | 200 | def warning(message): 201 | return LogMessage('warning', message).write() 202 | 203 | 204 | def critical(message): 205 | return LogMessage('critical', message).write() 206 | -------------------------------------------------------------------------------- /ceph_medic/tests/test_collector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ceph_medic import collector, metadata 4 | from mock import Mock 5 | 6 | 7 | class FakeConnRemoteModule(object): 8 | """ 9 | A fake remote_module class to be used 10 | with Mocked connection objects. 11 | 12 | This class contains stubbed methods for functions 13 | in ceph_medic.remote.functions which get their return 14 | value from the class attribute return_values. 15 | 16 | When creating an instance pass a dictionary that maps 17 | function names to their return values. 
def get_tree(files=None, dirs=None):
    """
    Build a minimal path-tree mapping with 'files' and 'dirs' keys, using
    small one-entry defaults when either listing is not provided.
    """
    return {
        'files': ["file1.txt"] if files is None else files,
        'dirs': ["dir1"] if dirs is None else dirs,
    }
class TestCollectSocketInfo(object):

    def _patch_socket_commands(self, monkeypatch):
        # the actual socket probing commands are irrelevant here; stub both
        monkeypatch.setattr(
            collector.remote.commands, 'ceph_socket_version',
            lambda conn, socket: dict())
        monkeypatch.setattr(
            collector.remote.commands, 'daemon_socket_config',
            lambda conn, socket: dict())

    def tests_collects_sockets(self, monkeypatch):
        self._patch_socket_commands(monkeypatch)
        metadata = {
            'paths': {
                '/var/run/ceph': {'files': ['/var/run/ceph/osd.asok']},
            },
        }
        result = collector.collect_socket_info(Mock(), metadata)
        assert '/var/run/ceph/osd.asok' in result

    def test_ignores_unknown_files(self, monkeypatch):
        self._patch_socket_commands(monkeypatch)
        metadata = {
            'paths': {
                '/var/run/ceph': {
                    'files': [
                        '/var/run/ceph/osd.asok',
                        '/var/run/ceph/osd.log',
                    ]
                },
            },
        }
        result = collector.collect_socket_info(Mock(), metadata)
        assert '/var/run/ceph/osd.log' not in result
assert "meta" in metadata["mons"]["mon0"] 138 | 139 | 140 | class TestGetNodeMetadata(object): 141 | 142 | @pytest.mark.parametrize( 143 | 'key', 144 | ['ceph', 'devices', 'paths', 'network',], 145 | ) 146 | def test_collects_metadata(self, key, monkeypatch): 147 | def mock_metadata(*args, **kwargs): 148 | return dict(meta="data") 149 | monkeypatch.setattr(collector, "collect_devices", mock_metadata) 150 | monkeypatch.setattr(collector, "collect_paths", mock_metadata) 151 | monkeypatch.setattr(collector, "collect_network", mock_metadata) 152 | monkeypatch.setattr(collector, "collect_ceph_info", mock_metadata) 153 | monkeypatch.setattr(collector, "collect_socket_info", mock_metadata) 154 | monkeypatch.setattr(collector, "collect_ceph_osd_info", mock_metadata) 155 | result = collector.get_node_metadata(Mock(), "mon0", []) 156 | assert key in result 157 | -------------------------------------------------------------------------------- /ceph_medic/remote/functions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import grp 3 | import pwd 4 | import traceback 5 | import sys 6 | import subprocess 7 | 8 | 9 | # Utilities 10 | # 11 | def capture_exception(error): 12 | details = {'attributes': {}} 13 | details['name'] = error.__class__.__name__ 14 | details['repr'] = str(error) 15 | exc_type, exc_value, exc_traceback = sys.exc_info() 16 | details['traceback'] = ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback)) 17 | for attr in dir(error): 18 | if not attr.startswith('__'): 19 | try: 20 | details['attributes'][attr] = str(getattr(error, attr)) 21 | except Exception: 22 | # getting an exception here is entirely possible, and since 23 | # there is no remote logging there is nothing we can do other 24 | # than eat it up. 
This section is going through each of the 25 | # attributes of the exception raised so it is mildly acceptable 26 | # to skip if anything is breaking 27 | details['attributes'][attr] = None 28 | return details 29 | 30 | 31 | def decoded(string): 32 | try: 33 | return string.decode('utf-8') 34 | except AttributeError: 35 | return string 36 | 37 | 38 | # Paths 39 | # 40 | def stat_path(path, skip_dirs=None, skip_files=None, get_contents=False): 41 | """stat a path on a remote host""" 42 | # Capture all information about a path, optionally getting the contents of 43 | # the remote path if it is a file. Exceptions get appended to each dictionary 44 | # object associated with the path 45 | 46 | # .. note:: Neither ``skip_dirs`` nor ``skip_files`` is used here, but the 47 | # remote execution of functions use name-based arguments which does not allow 48 | # the use of ``**kw`` 49 | metadata = {u'exception': {}} 50 | path = decoded(path) 51 | try: 52 | stat_info = os.stat(path) 53 | if get_contents and os.path.isfile(path): 54 | with open(path, 'r') as opened_file: 55 | metadata[u'contents'] = decoded(opened_file.read()) 56 | except Exception as error: 57 | return {'exception': capture_exception(error)} 58 | 59 | allowed_attrs = [ 60 | 'n_fields', 'n_sequence_fields', 'n_unnamed_fields', 'st_atime', 61 | 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev', 'st_gid', 'st_ino', 62 | 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev', 'st_size', 'st_uid' 63 | ] 64 | 65 | # get all the stat results back into the metadata 66 | for attr in dir(stat_info): 67 | attr = decoded(attr) 68 | if attr in allowed_attrs: 69 | value = decoded(getattr(stat_info, attr)) 70 | metadata[attr] = value 71 | 72 | # translate the owner and group: 73 | try: 74 | metadata[u'owner'] = decoded(pwd.getpwuid(stat_info.st_uid)[0]) 75 | except KeyError: 76 | metadata[u'owner'] = stat_info.st_uid 77 | try: 78 | metadata[u'group'] = decoded(grp.getgrgid(stat_info.st_gid)[0]) 79 | except KeyError: 80 | 
metadata[u'group'] = stat_info.st_gid 81 | 82 | return metadata 83 | 84 | 85 | def path_tree(path, skip_dirs=None, skip_files=None, get_contents=None): 86 | """generate a path tree""" 87 | # Generate a tree of paths, including directories and files, recursively, but 88 | # with the ability to exclude dirs and files with ``skip_dirs`` and 89 | # ``skip_files``. 90 | # The tree output groups the files and directories like:: 91 | 92 | # { 93 | # 'path': '/etc/ceph', 94 | # 'dirs': ['/etc/ceph/ceph.d/'], 95 | # 'files': ['/etc/ceph/ceph.d/test.conf', '/etc/ceph/rbdmap'] 96 | # } 97 | 98 | # .. note:: ``get_contents`` is not used here, but the remote execution of functions 99 | # use name-based arguments which does not allow the use of ``**kw`` 100 | try: 101 | path = path.decode('utf-8') 102 | except AttributeError: 103 | pass 104 | skip_files = skip_files or [] 105 | skip_dirs = skip_dirs or [] 106 | files = [] 107 | dirs = [] 108 | # traverse for files and directories, topdown allows us to trim the 109 | # directories on the fly 110 | for root, _dirs, _files in os.walk(path, topdown=True): 111 | _dirs[:] = [d for d in _dirs if d not in skip_dirs] 112 | for _file in _files: 113 | absolute_path = os.path.join(root, _file) 114 | if _file in skip_files: 115 | continue 116 | files.append(absolute_path) 117 | 118 | for _dir in _dirs: 119 | absolute_path = os.path.join(root, _dir) 120 | dirs.append(absolute_path) 121 | 122 | # using the 'u' prefix forces python3<->python2 compatibility otherwise the 123 | # keys would be bytes, regardless if input is a str which should've forced 124 | # a 'str' behavior. 
The prefix is invalid syntax for Python 3.0 to 3.2, so 125 | # this will be valid in Python 3.3 and newer and Python 2 126 | return {u'path': path, u'dirs': dirs, u'files': files} 127 | 128 | 129 | def which(executable): 130 | """find the location of an executable""" 131 | locations = ( 132 | '/usr/local/bin', 133 | '/bin', 134 | '/usr/bin', 135 | '/usr/local/sbin', 136 | '/usr/sbin', 137 | '/sbin', 138 | ) 139 | 140 | for location in locations: 141 | executable_path = os.path.join(location, executable) 142 | if os.path.exists(executable_path): 143 | return executable_path 144 | 145 | 146 | def run(command): 147 | """ 148 | run a command, return stdout, stderr, and exit code. 149 | """ 150 | process = subprocess.Popen( 151 | command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True 152 | ) 153 | stdout = process.stdout.read().splitlines() 154 | stderr = process.stderr.read().splitlines() 155 | returncode = process.wait() 156 | 157 | return stdout, stderr, returncode 158 | 159 | 160 | # remoto magic, needed to execute these functions remotely 161 | if __name__ == '__channelexec__': 162 | for item in channel: # noqa 163 | channel.send(eval(item)) # noqa 164 | -------------------------------------------------------------------------------- /ceph_medic/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | from ceph_medic import runner 4 | import ceph_medic 5 | from ceph_medic.tests import base_metadata 6 | 7 | 8 | class FakeWriter(object): 9 | 10 | def __init__(self): 11 | self.calls = [] 12 | self.write = self.raw 13 | self.loader = self 14 | 15 | def raw(self, string): 16 | self.calls.append(string) 17 | 18 | def bold(self, string): 19 | self.calls.append(string) 20 | 21 | def get_output(self): 22 | return '\n'.join(self.calls) 23 | 24 | 25 | @pytest.fixture(scope='class', autouse=True) 26 | def clear_metadata(): 27 | ceph_medic.metadata = base_metadata 28 | 29 | 30 | 
@pytest.fixture 31 | def mon_keyring(): 32 | def make_keyring(default=False): 33 | if default: 34 | key = "AQBvaBFZAAAAABAA9VHgwCg3rWn8fMaX8KL01A==" 35 | else: 36 | key = "%032x==" % random.getrandbits(128) 37 | 38 | return """ 39 | [mon.] 40 | key = %s 41 | caps mon = "allow *" 42 | """ % key 43 | return make_keyring 44 | 45 | 46 | @pytest.fixture 47 | def terminal(monkeypatch): 48 | fake_writer = FakeWriter() 49 | monkeypatch.setattr(runner.terminal, 'write', fake_writer) 50 | return fake_writer 51 | 52 | 53 | @pytest.fixture 54 | def data(): 55 | """ 56 | Default data structure for remote nodes 57 | """ 58 | def _data(): 59 | return { 60 | 'ceph': {'installed': True, 'version': '12.2.1', 'sockets':{}}, 61 | 'paths': { 62 | '/etc/ceph': {'files': {}, 'dirs': {}}, 63 | '/var/lib/ceph': {'files': {}, 'dirs': {}}, 64 | } 65 | } 66 | return _data 67 | 68 | 69 | @pytest.fixture 70 | def make_data(data, **kw): 71 | """ 72 | Customize basic data structure on remote nodes 73 | """ 74 | def update(dictionary=None): 75 | base = data() 76 | if not dictionary: 77 | return base 78 | base.update(dictionary) 79 | return base 80 | return update 81 | 82 | 83 | @pytest.fixture 84 | def make_nodes(): 85 | """ 86 | Helper to generate nodes for daemons 87 | """ 88 | def make_data(**kw): 89 | """ 90 | ``kw`` is expected to be a mapping between daemon name and hosts for 91 | that daemon, like:: 92 | 93 | make_data(mons=['node1', 'node2'] 94 | """ 95 | # default set of nodes 96 | data = dict( 97 | (k, {}) for k in ['rgws', 'mgrs', 'mdss', 'clients', 'osds', 'mons'] 98 | ) 99 | for daemon, node_names in kw.items(): 100 | data[daemon] = [dict(host=node_name) for node_name in node_names] 101 | return data 102 | return make_data 103 | 104 | 105 | class Capture(object): 106 | 107 | def __init__(self, *a, **kw): 108 | self.a = a 109 | self.kw = kw 110 | self.calls = [] 111 | self.return_values = kw.get('return_values', False) 112 | self.always_returns = kw.get('always_returns', False) 113 | 
114 | def __call__(self, *a, **kw): 115 | self.calls.append({'args': a, 'kwargs': kw}) 116 | if self.always_returns: 117 | return self.always_returns 118 | if self.return_values: 119 | return self.return_values.pop() 120 | 121 | 122 | class Factory(object): 123 | 124 | def __init__(self, **kw): 125 | for k, v in kw.items(): 126 | setattr(self, k, v) 127 | 128 | 129 | @pytest.fixture 130 | def factory(): 131 | return Factory 132 | 133 | 134 | @pytest.fixture 135 | def conn(): 136 | """ 137 | Useful when trying to pass a ``conn`` object around that will porbably want 138 | to log output 139 | """ 140 | log = lambda x: x 141 | logger = Factory(error=log, exception=log) 142 | return Factory(logger=logger) 143 | 144 | 145 | @pytest.fixture 146 | def capture(): 147 | return Capture() 148 | 149 | 150 | @pytest.fixture 151 | def fake_run(monkeypatch): 152 | fake_run = Capture() 153 | monkeypatch.setattr('remoto.process.run', fake_run) 154 | return fake_run 155 | 156 | 157 | @pytest.fixture 158 | def fake_check(monkeypatch): 159 | fake_call = Capture(always_returns=([], [], 0)) 160 | monkeypatch.setattr('remoto.process.check', fake_call) 161 | return fake_call 162 | 163 | 164 | @pytest.fixture 165 | def stub_check(monkeypatch): 166 | """ 167 | Monkeypatches process.check, so that a caller can add behavior to the 168 | response 169 | """ 170 | def apply(return_values, module=None, string_module='remoto.process.check'): 171 | """ 172 | ``return_values`` should be a tuple of 3 elements: stdout, stderr, and 173 | code. This should mimic the ``check()`` return values. For example:: 174 | 175 | (['stdout'], ['stderr'], 0) 176 | 177 | Each item in the stdout or stderr lists represents a line. 
178 | Additionally, if more than one response is wanted, a list with multiple 179 | tuples can be provided:: 180 | 181 | 182 | [ 183 | (['output'], [], 0), 184 | ([], ['error condition'], 1), 185 | (['output'], [], 0), 186 | ] 187 | 188 | When patching, most of the time the default ``string_module`` will be 189 | fine, but if it is required to patch an actual module with the added 190 | string, then it is possible to use them accordingly: whne the module is 191 | set, the call to ``monkeypatch`` will use both like:: 192 | 193 | monkeypatch.setattr(module, 'function', value) 194 | 195 | Otherwise it will just patch it like:: 196 | 197 | monkeypatch.setattr('remoto.process.check', value) 198 | 199 | """ 200 | if isinstance(return_values, tuple): 201 | return_values = [return_values] 202 | stubbed_call = Capture(return_values=return_values) 203 | if module: 204 | monkeypatch.setattr(module, string_module, stubbed_call) 205 | else: 206 | monkeypatch.setattr(string_module, stubbed_call) 207 | return stubbed_call 208 | 209 | return apply 210 | 211 | 212 | @pytest.fixture(autouse=True) 213 | def reset_file_config(request, monkeypatch): 214 | """ 215 | The globally available ``ceph_medic.config.file`` might get mangled in 216 | tests, make sure that after evert test, it gets reset, preventing pollution 217 | going into other tests later. 
218 | """ 219 | def fin(): 220 | ceph_medic.config.file = ceph_medic.UnloadedConfig() 221 | request.addfinalizer(fin) 222 | -------------------------------------------------------------------------------- /ceph_medic/main.py: -------------------------------------------------------------------------------- 1 | from ceph_medic import check, log 2 | import sys 3 | import os 4 | from textwrap import dedent 5 | from tambo import Transport 6 | from execnet.gateway_bootstrap import HostNotFound 7 | import ceph_medic 8 | from ceph_medic.decorators import catches 9 | from ceph_medic.util import configuration, hosts 10 | from ceph_medic import terminal 11 | 12 | 13 | class Medic(object): 14 | _help = """ 15 | ceph-medic: A utility to run system checks on a Ceph cluster. 16 | 17 | Version: {version} 18 | 19 | Global Options: 20 | --config Path to a specific configuration file. Overrides the default: 21 | $HOME/.cephmedic.conf. 22 | --cluster Use a specific cluster name (defaults to 'ceph'). Alternatively, 23 | this is inferred from a conf file name in /etc/ceph/ 24 | --ssh-config Specify an alternate configuration for SSH 25 | --version, version Shows the current installed version 26 | --inventory Prefer a ceph-ansible inventory (hosts) file instead of default 27 | (cwd, /etc/ansible/hosts) locations 28 | --verbosity Set verbosity level of logging output 29 | 30 | {sub_help} 31 | 32 | {config_path_header}: {config_path} 33 | {hosts_file_header}: {hosts_file} 34 | {configured_nodes} 35 | """ 36 | mapper = { 37 | 'check': check.Check, 38 | # TODO: this needs a bit more work, disabling for now 39 | #'generate': generate.Generate, 40 | } 41 | 42 | def __init__(self, argv=None, parse=True): 43 | if argv is None: 44 | argv = sys.argv 45 | if parse: 46 | self.main(argv) 47 | 48 | def help(self, sub_help=None): 49 | if self.hosts_file is None: 50 | hosts_file_header = terminal.red('Loaded Inventory Hosts file') 51 | hosts_file = 'No hosts file found in cwd, /etc/ansible/, or 
configured' 52 | else: 53 | hosts_file_header = terminal.green('Loaded Inventory Hosts file') 54 | hosts_file = self.hosts_file 55 | return self._help.format( 56 | version=ceph_medic.__version__, 57 | config_path=self.config_path, 58 | config_path_header=terminal.green('Loaded Config Path'), 59 | hosts_file=hosts_file, 60 | hosts_file_header=hosts_file_header, 61 | sub_help=sub_help, 62 | configured_nodes=self.configured_nodes 63 | ) 64 | 65 | @property 66 | def configured_nodes(self): 67 | _help = dedent(""" 68 | Configured nodes (loaded from inventory hosts file): 69 | OSDs: {osd_node_count} 70 | MONs: {mon_node_count} 71 | MGRs: {mgr_node_count} 72 | MDSs: {mds_node_count} 73 | RGWs: {rgw_node_count}""") 74 | if self.hosts_file: # we have nodes that have been loaded 75 | nodes = ceph_medic.config.nodes 76 | return _help.format( 77 | osd_node_count=len(nodes.get('osds', [])), 78 | mon_node_count=len(nodes.get('mons', [])), 79 | mds_node_count=len(nodes.get('mdss', [])), 80 | mgr_node_count=len(nodes.get('mgrs', [])), 81 | rgw_node_count=len(nodes.get('rgws', [])) 82 | ) 83 | return '' 84 | 85 | @catches((RuntimeError, KeyboardInterrupt, HostNotFound)) 86 | def main(self, argv): 87 | options = [ 88 | '--cluster', '--ssh-config', '--inventory', 89 | '--config', '--verbosity', 90 | ] 91 | parser = Transport( 92 | argv, options=options, 93 | check_help=False, 94 | check_version=False 95 | ) 96 | parser.parse_args() 97 | 98 | self.config_path = parser.get('--config', configuration.location()) 99 | 100 | # load medic configuration 101 | loaded_config = configuration.load(path=parser.get('--config', self.config_path)) 102 | 103 | # this is the earliest we can have enough config to setup logging 104 | log.setup(loaded_config) 105 | ceph_medic.config.file = loaded_config 106 | global_options = dict(ceph_medic.config.file._sections['global']) 107 | 108 | # SSH config 109 | ceph_medic.config.ssh_config = parser.get('--ssh-config', global_options.get('--ssh-config')) 110 | 
if ceph_medic.config.ssh_config: 111 | ssh_config_path = ceph_medic.config.ssh_config 112 | if not os.path.exists(ssh_config_path): 113 | terminal.error("the given ssh config path does not exist: %s" % ssh_config_path) 114 | sys.exit() 115 | 116 | ceph_medic.config.cluster_name = parser.get('--cluster', 'ceph') 117 | ceph_medic.metadata['cluster_name'] = 'ceph' 118 | 119 | # Deployment Type 120 | deployment_type = ceph_medic.config.file.get_safe('global', 'deployment_type', 'baremetal') 121 | if deployment_type in ['kubernetes', 'openshift', 'k8s', 'oc']: 122 | pod_hosts = hosts.container_platform(deployment_type) 123 | ceph_medic.config.nodes = pod_hosts 124 | ceph_medic.config.hosts_file = ':memory:' 125 | self.hosts_file = ':memory:' 126 | else: 127 | # Hosts file 128 | self.hosts_file = parser.get('--inventory', configuration.get_host_file()) 129 | 130 | # find the hosts files, by the CLI first, fallback to the configuration 131 | # file, and lastly if none of those are found or defined, try to load 132 | # from well known locations (cwd, and /etc/ansible/) 133 | loaded_hosts = configuration.load_hosts( 134 | parser.get('--inventory', 135 | global_options.get('--inventory', self.hosts_file))) 136 | ceph_medic.config.nodes = loaded_hosts.nodes 137 | ceph_medic.config.hosts_file = loaded_hosts.filename 138 | self.hosts_file = loaded_hosts.filename 139 | 140 | if deployment_type in ['docker', 'podman']: 141 | ceph_medic.config.nodes = hosts.basic_containers( 142 | deployment_type) 143 | 144 | parser.catch_version = ceph_medic.__version__ 145 | parser.mapper = self.mapper 146 | parser.catch_help = self.help(parser.subhelp()) 147 | if len(argv) <= 1: 148 | return parser.print_help() 149 | ceph_medic.config.config_path = self.config_path 150 | parser.dispatch() 151 | parser.catches_help() 152 | parser.catches_version() 153 | 154 | # Verbosity 155 | verbosity = parser.get('--verbosity', 'debug') 156 | ceph_medic.config.verbosity = verbosity.lower() 157 | 
-------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. ceph-medic documentation master file, created by 2 | sphinx-quickstart on Tue Jun 27 14:32:23 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================================================= 7 | Introduction 8 | ================================================= 9 | 10 | ``ceph-medic`` is a very simple tool that runs against a Ceph cluster to detect 11 | common issues that might prevent correct functionality. It requires 12 | non-interactive SSH access to accounts that can ``sudo`` without a password 13 | prompt. 14 | 15 | Usage 16 | ===== 17 | 18 | The basic usage of ``ceph-medic`` is to perform checks against a ceph cluster 19 | to identify potential issues with its installation or configuration. To do 20 | this, run the following command:: 21 | 22 | ceph-medic --inventory /path/to/hosts --ssh-config /path/to/ssh_config check 23 | 24 | Inventory 25 | --------- 26 | ``ceph-medic`` needs to know the nodes that exist in your ceph cluster before 27 | it can perform checks. The inventory (or ``hosts`` file) is a typical Ansible 28 | inventory file and will be used to inform ``ceph-medic`` of the nodes in your 29 | cluster and their respective roles. The following standard host groups are 30 | supported by ``ceph-medic``: ``mons``, ``osds``, ``rgws``, ``mdss``, ``mgrs`` 31 | and ``clients``. An example ``hosts`` file would look like:: 32 | 33 | [mons] 34 | mon0 35 | mon1 36 | 37 | [osds] 38 | osd0 39 | 40 | [mgrs] 41 | mgr0 42 | 43 | The location of the ``hosts`` file can be passed into ``ceph-medic`` by using 44 | the ``--inventory`` cli option (e.g ``ceph-medic --inventory /path/to/hosts``). 
45 | 46 | If the ``--inventory`` option is not defined, ``ceph-medic`` will first look in 47 | the current working directory for a file named ``hosts``. If the file does not 48 | exist, it will look for ``/etc/ansible/hosts`` to be used as the inventory. 49 | 50 | .. note:: Defining the inventory location is also possible via the config file 51 | under the ``[global]`` section. 52 | 53 | 54 | Inventory for Containers 55 | ------------------------ 56 | Containerized deployments are also supported, via ``docker`` and ``podman``. 57 | As with ``baremetal`` deployments, an inventory file is required. If the 58 | cluster was deployed with ``ceph-ansible``, you may use that existing 59 | inventory. 60 | 61 | To configure ceph-medic to connect to a containerized cluster, the glocal section of the 62 | configuration needs to define ``deployment_type`` to either ``docker`` or 63 | ``podman``. For example:: 64 | 65 | [global] 66 | 67 | deployment_type = podman 68 | 69 | 70 | Inventory for Container Platforms 71 | --------------------------------- 72 | Both ``kubernetes`` and ``openshift`` platforms can host containers remotely, 73 | but do allow to connect and retrieve information from a central location. 74 | To configure ceph-medic to connect to a platform, the glocal section of the 75 | configuration needs to define ``deployment_type`` to either ``kubernetes``, which 76 | uses the ``kubectl`` command, or ``openshift``, which uses the ``oc`` command. For example:: 77 | 78 | [global] 79 | 80 | deployment_type = openshift 81 | 82 | 83 | When using ``openshift`` or ``kubernetes`` as a deployment type, there is no 84 | requirement to define a ``hosts`` file. The hosts are generated dynamically by 85 | calling out to the platform and retrieving the pods. When the pods are 86 | identified, they are grouped by deamon type (osd, mgr, rgw, mon, etc...). 
87 | 88 | SSH Config 89 | ---------- 90 | 91 | All nodes in your ``hosts`` file must be configured to provide non-interactive 92 | SSH access to accounts that can ``sudo`` without a password prompt. 93 | 94 | .. note:: 95 | This is the same ssh config required by ansible. If you've used ``ceph-ansible`` to deploy your 96 | cluster then your nodes are most likely already configured for this type of ssh access. If that 97 | is the case, using the same user that performed the initial deployment would be easiest. 98 | 99 | To provide your ssh config you must use the ``--ssh-config`` flag and give it 100 | a path to a file that defines your ssh configuration. For example, a file like 101 | this is used to connect with a cluster comprised of vagrant vms:: 102 | 103 | Host mon0 104 | HostName 127.0.0.1 105 | User vagrant 106 | Port 2200 107 | UserKnownHostsFile /dev/null 108 | StrictHostKeyChecking no 109 | PasswordAuthentication no 110 | IdentityFile /Users/andrewschoen/.vagrant.d/insecure_private_key 111 | IdentitiesOnly yes 112 | LogLevel FATAL 113 | 114 | Host osd0 115 | HostName 127.0.0.1 116 | User vagrant 117 | Port 2201 118 | UserKnownHostsFile /dev/null 119 | StrictHostKeyChecking no 120 | PasswordAuthentication no 121 | IdentityFile /Users/andrewschoen/.vagrant.d/insecure_private_key 122 | IdentitiesOnly yes 123 | LogLevel FATAL 124 | 125 | 126 | .. note:: SSH configuration is not needed when using ``kubernetes`` or 127 | ``openshift`` 128 | 129 | 130 | Logging 131 | ------- 132 | 133 | By default ``ceph-medic`` sends complete logs to the current working directory. 134 | This log file is more verbose than the output displayed on the terminal. To 135 | change where these logs are created, modify the default value for ``--log-path`` 136 | in ``~/.cephmedic.conf``. 137 | 138 | Running checks 139 | -------------- 140 | 141 | To perform checks against your cluster use the ``check`` subcommand. 
This will 142 | perform a series of general checks, as well as checks specific to each daemon. 143 | Sample output from this command will look like:: 144 | 145 | ceph-medic --ssh-config vagrant_ssh_config check 146 | Host: mgr0 connection: [connected ] 147 | Host: mon0 connection: [connected ] 148 | Host: osd0 connection: [connected ] 149 | Collection completed! 150 | 151 | ======================= Starting remote check session ======================== 152 | Version: 0.0.1 Cluster Name: "test" 153 | Total hosts: [3] 154 | OSDs: 1 MONs: 1 Clients: 0 155 | MDSs: 0 RGWs: 0 MGRs: 1 156 | 157 | ================================================================================ 158 | 159 | ---------- managers ---------- 160 | mgr0 161 | 162 | ------------ osds ------------ 163 | osd0 164 | 165 | ------------ mons ------------ 166 | mon0 167 | 168 | 17 passed, 0 errors, on 4 hosts 169 | 170 | 171 | The logging can also be configured in the ``cephmedic.conf`` file in the global 172 | section:: 173 | 174 | [global] 175 | --log-path = . 176 | 177 | To ensure that cluster checks run properly, at least one monitor node should have administrative privileges. 
178 | -------------------------------------------------------------------------------- /ceph_medic/tests/test_runner.py: -------------------------------------------------------------------------------- 1 | import ceph_medic 2 | from ceph_medic import runner 3 | from ceph_medic.tests import base_metadata 4 | from textwrap import dedent 5 | from ceph_medic.util import configuration 6 | 7 | 8 | class TestRunner(object): 9 | 10 | def setup(self): 11 | runner.metadata = base_metadata 12 | 13 | def teardown(self): 14 | runner.metadata = base_metadata 15 | 16 | def test_calculate_total_hosts_is_0(self): 17 | run = runner.Runner() 18 | assert run.total_hosts == 0 19 | 20 | def test_calculate_hosts_single_daemon_type(self): 21 | ceph_medic.metadata['nodes']['osds'] = [{'host': 'node1'},{'host': 'node2'}] 22 | runner.metadata = ceph_medic.metadata 23 | run = runner.Runner() 24 | assert run.total_hosts == 2 25 | 26 | def test_count_from_different_daemon_types(self): 27 | ceph_medic.metadata['nodes']['osds'] = [{'host': 'node1'},{'host': 'node2'}] 28 | ceph_medic.metadata['nodes']['mons'] = [{'host': 'node3'},{'host': 'node4'}] 29 | runner.metadata = ceph_medic.metadata 30 | run = runner.Runner() 31 | assert run.total_hosts == 4 32 | 33 | 34 | class TestReport(object): 35 | 36 | def setup(self): 37 | runner.metadata = base_metadata 38 | runner.metadata['nodes'] = {} 39 | self.results = runner.Runner() 40 | 41 | def test_reports_unhandled_internal_errors(self, terminal): 42 | self.results.internal_errors = ['I am an error'] 43 | runner.report(self.results) 44 | assert 'While running checks, ceph-medic had 1 unhandled errors' in terminal.calls[-1] 45 | 46 | def test_reports_no_errors(self, terminal): 47 | runner.report(self.results) 48 | assert terminal.calls[0] == '\n0 passed, on 0 hosts' 49 | 50 | def test_reports_warning(self, terminal): 51 | self.results.warnings = 1 52 | runner.report(self.results) 53 | assert terminal.calls[0] == '\n0 passed, 1 warning, on 0 hosts' 54 | 55 
| def test_reports_warnings(self, terminal): 56 | self.results.warnings = 2 57 | runner.report(self.results) 58 | assert terminal.calls[0] == '\n0 passed, 2 warnings, on 0 hosts' 59 | 60 | def test_reports_error(self, terminal): 61 | self.results.errors = 1 62 | runner.report(self.results) 63 | assert terminal.calls[0] == '\n0 passed, 1 error, on 0 hosts' 64 | 65 | def test_reports_errors(self, terminal): 66 | self.results.errors = 2 67 | runner.report(self.results) 68 | assert terminal.calls[0] == '\n0 passed, 2 errors, on 0 hosts' 69 | 70 | def test_reports_error_and_warning(self, terminal): 71 | self.results.errors = 1 72 | self.results.warnings = 1 73 | runner.report(self.results) 74 | assert terminal.calls[0] == '\n0 passed, 1 error, 1 warning, on 0 hosts' 75 | 76 | def test_reports_errors_and_warnings(self, terminal): 77 | self.results.errors = 2 78 | self.results.warnings = 2 79 | runner.report(self.results) 80 | assert terminal.calls[0] == '\n0 passed, 2 errors, 2 warnings, on 0 hosts' 81 | 82 | def test_reports_internal_errors(self, terminal): 83 | self.results.internal_errors = ['error 1', 'error 2'] 84 | self.results.warnings = 2 85 | runner.report(self.results) 86 | assert terminal.calls[0] == '\n0 passed, 2 warnings, 2 internal errors, on 0 hosts' 87 | 88 | 89 | class TestReportBasicOutput(object): 90 | 91 | def setup(self): 92 | contents = dedent(""" 93 | [global] 94 | # 95 | """) 96 | conf = configuration.load_string(contents) 97 | ceph_medic.config.file = conf 98 | runner.metadata = base_metadata 99 | runner.metadata['cluster_name'] = 'ceph' 100 | runner.Runner().run() 101 | 102 | def teardown(self): 103 | runner.metadata = base_metadata 104 | 105 | def test_has_version(self, terminal): 106 | assert 'Version: ' in terminal.get_output() 107 | 108 | def test_has_cluster_name(self, terminal): 109 | assert 'Cluster Name: "ceph"' in terminal.get_output() 110 | 111 | def test_has_no_hosts(self, terminal): 112 | assert 'Total hosts: [0]' in 
terminal.get_output() 113 | 114 | def test_has_a_header(self, terminal): 115 | assert '== Starting remote check session ==' in terminal.get_output() 116 | 117 | def test_has_no_OSDs(self, terminal): 118 | assert 'OSDs: 0' in terminal.get_output() 119 | 120 | def test_has_no_MONs(self, terminal): 121 | assert 'MONs: 0' in terminal.get_output() 122 | 123 | def test_has_no_Clients(self, terminal): 124 | assert 'Clients: 0' in terminal.get_output() 125 | 126 | def test_has_no_MDSs(self, terminal): 127 | assert 'MDSs: 0' in terminal.get_output() 128 | 129 | def test_has_no_MGRs(self, terminal): 130 | assert 'MGRs: 0' in terminal.get_output() 131 | 132 | def test_has_no_RGWs(self, terminal): 133 | assert 'RGWs: 0' in terminal.get_output() 134 | 135 | 136 | class TestReportErrors(object): 137 | 138 | def setup(self): 139 | contents = dedent(""" 140 | [global] 141 | # 142 | """) 143 | conf = configuration.load_string(contents) 144 | ceph_medic.config.file = conf 145 | runner.metadata = base_metadata 146 | runner.metadata['cluster_name'] = 'ceph' 147 | runner.Runner().run() 148 | 149 | def teardown(self): 150 | runner.metadata = base_metadata 151 | 152 | def test_get_new_lines_in_errors(self, terminal, mon_keyring, data, monkeypatch): 153 | data_node1 = data() 154 | data_node2 = data() 155 | data_node1['paths']['/var/lib/ceph']['files'] = { 156 | '/var/lib/ceph/mon/ceph-0/keyring': {'contents': mon_keyring()} 157 | } 158 | data_node1['paths']['/var/lib/ceph']['dirs'] = { 159 | '/var/lib/ceph/osd/ceph-10': {}, 160 | '/var/lib/ceph/osd/ceph-11': {}, 161 | '/var/lib/ceph/osd/ceph-12': {}, 162 | '/var/lib/ceph/osd/ceph-13': {}, 163 | '/var/lib/ceph/osd/ceph-0': {}, 164 | '/var/lib/ceph/osd/ceph-1': {}, 165 | '/var/lib/ceph/osd/ceph-2': {}, 166 | '/var/lib/ceph/osd/ceph-3': {}, 167 | } 168 | 169 | data_node2['paths']['/var/lib/ceph']['files'] = { 170 | '/var/lib/ceph/mon/ceph-1/keyring': {'contents': mon_keyring()}, 171 | } 172 | data_node2['paths']['/var/lib/ceph']['dirs'] = { 
173 | '/var/lib/ceph/osd/ceph-10': {}, 174 | '/var/lib/ceph/osd/ceph-11': {}, 175 | '/var/lib/ceph/osd/ceph-12': {}, 176 | '/var/lib/ceph/osd/ceph-13': {}, 177 | '/var/lib/ceph/osd/ceph-0': {}, 178 | '/var/lib/ceph/osd/ceph-1': {}, 179 | '/var/lib/ceph/osd/ceph-2': {}, 180 | '/var/lib/ceph/osd/ceph-3': {}, 181 | } 182 | 183 | # set the data everywhere we need it 184 | ceph_medic.metadata['mons'] = {'node1': data_node1, 'node2': data_node2} 185 | monkeypatch.setattr(ceph_medic.checks.mons, 'metadata', ceph_medic.metadata) 186 | 187 | runner.Runner().run() 188 | # Any line that is an error or a warning *must* end with a newline 189 | for line in terminal.calls: 190 | if line.lstrip().startswith(('E', 'W')): 191 | assert line.endswith('\n') 192 | -------------------------------------------------------------------------------- /docs/source/_themes/ceph/static/nature.css_t: -------------------------------------------------------------------------------- 1 | /* 2 | * nature.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- nature theme. 6 | * 7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | @font-face { 17 | font-family: 'ApexSansMedium'; 18 | src: url('font/ApexSans-Medium.eot'); 19 | src: url('font/ApexSans-Medium.eot?#iefix') format('embedded-opentype'), 20 | url('font/ApexSans-Medium.woff') format('woff'), 21 | url('font/ApexSans-Medium.ttf') format('truetype'), 22 | url('font/ApexSans-Medium.svg#FontAwesome') format('svg'); 23 | font-weight: normal; 24 | font-style: normal; 25 | } 26 | 27 | @font-face { 28 | font-family: 'ApexSansBook'; 29 | src: url('font/ApexSans-Book.eot'); 30 | src: url('font/ApexSans-Book.eot?#iefix') format('embedded-opentype'), 31 | url('font/ApexSans-Book.woff') format('woff'), 32 | url('font/ApexSans-Book.ttf') format('truetype'), 33 | url('font/ApexSans-Book.svg#FontAwesome') format('svg'); 34 | font-weight: normal; 35 | font-style: normal; 36 | } 37 | 38 | body { 39 | font: 14px/1.4 Helvetica, Arial, sans-serif; 40 | background-color: #E6E8E8; 41 | color: #37424A; 42 | margin: 0; 43 | padding: 0; 44 | border-top: 5px solid #F05C56; 45 | } 46 | 47 | div.documentwrapper { 48 | float: left; 49 | width: 100%; 50 | } 51 | 52 | div.bodywrapper { 53 | margin: 0 0 0 330px; 54 | } 55 | 56 | hr { 57 | border: 1px solid #B1B4B6; 58 | } 59 | 60 | div.document { 61 | background-color: #ffffff; 62 | } 63 | 64 | div.body { 65 | background-color: #ffffff; 66 | color: #3E4349; 67 | padding: 0 30px 30px 30px; 68 | } 69 | 70 | div.footer { 71 | color: #222B31; 72 | width: 100%; 73 | padding: 13px 0; 74 | text-align: center; 75 | font-size: 75%; 76 | } 77 | 78 | div.footer a { 79 | color: #444; 80 | text-decoration: underline; 81 | } 82 | 83 | div.related { 84 | background-color: #80D2DC; 85 | line-height: 32px; 86 | color: #37424A; 87 | // text-shadow: 0px 1px 0 #444; 88 | font-size: 100%; 89 | border-top: #9C4850 5px solid; 90 | } 91 | 92 | div.related a { 93 | color: #37424A; 94 | 
text-decoration: none; 95 | } 96 | 97 | div.related a:hover { 98 | color: #fff; 99 | // text-decoration: underline; 100 | } 101 | 102 | div.sphinxsidebar { 103 | // font-size: 100%; 104 | line-height: 1.5em; 105 | width: 330px; 106 | } 107 | 108 | div.sphinxsidebarwrapper{ 109 | padding: 20px 0; 110 | background-color: #efefef; 111 | } 112 | 113 | div.sphinxsidebar h3, 114 | div.sphinxsidebar h4 { 115 | font-family: ApexSansMedium; 116 | color: #e6e8e8; 117 | font-size: 1.2em; 118 | font-weight: normal; 119 | margin: 0; 120 | padding: 5px 10px; 121 | background-color: #5e6a71; 122 | // text-shadow: 1px 1px 0 white; 123 | text-transform: uppercase; 124 | } 125 | 126 | div.sphinxsidebar h4{ 127 | font-size: 1.1em; 128 | } 129 | 130 | div.sphinxsidebar h3 a { 131 | color: #e6e8e8; 132 | } 133 | 134 | 135 | div.sphinxsidebar p { 136 | color: #888; 137 | padding: 5px 20px; 138 | } 139 | 140 | div.sphinxsidebar p.topless { 141 | } 142 | 143 | div.sphinxsidebar ul { 144 | margin: 10px 5px 10px 20px; 145 | padding: 0; 146 | color: #000; 147 | } 148 | 149 | div.sphinxsidebar a { 150 | color: #444; 151 | } 152 | 153 | div.sphinxsidebar input { 154 | border: 1px solid #ccc; 155 | font-family: sans-serif; 156 | font-size: 1em; 157 | } 158 | 159 | div.sphinxsidebar input[type=text]{ 160 | margin-left: 20px; 161 | } 162 | 163 | /* -- body styles ----------------------------------------------------------- */ 164 | 165 | a { 166 | color: #F05C56; 167 | text-decoration: none; 168 | } 169 | 170 | a:hover { 171 | color: #F05C56; 172 | text-decoration: underline; 173 | } 174 | 175 | div.body h1, 176 | div.body h2, 177 | div.body h3, 178 | div.body h4, 179 | div.body h5, 180 | div.body h6 { 181 | // font-family: ApexSansMedium; 182 | // background-color: #80D2DC; 183 | // font-weight: normal; 184 | // color: #37424a; 185 | margin: 30px 0px 10px 0px; 186 | padding: 5px 0 5px 0px; 187 | // text-shadow: 0px 1px 0 white; 188 | text-transform: uppercase; 189 | } 190 | 191 | div.body h1 { 
font: 20px/2.0 ApexSansBook; color: #37424A; border-top: 20px solid white; margin-top: 0; } 192 | div.body h2 { font: 18px/1.8 ApexSansMedium; background-color: #5E6A71; color: #E6E8E8; padding: 5px 10px; } 193 | div.body h3 { font: 16px/1.6 ApexSansMedium; color: #37424A; } 194 | div.body h4 { font: 14px/1.4 Helvetica, Arial, sans-serif; color: #37424A; } 195 | div.body h5 { font: 12px/1.2 Helvetica, Arial, sans-serif; color: #37424A; } 196 | div.body h6 { font-size: 100%; color: #37424A; } 197 | 198 | // div.body h2 { font-size: 150%; background-color: #E6E8E8; color: #37424A; } 199 | // div.body h3 { font-size: 120%; background-color: #E6E8E8; color: #37424A; } 200 | // div.body h4 { font-size: 110%; background-color: #E6E8E8; color: #37424A; } 201 | // div.body h5 { font-size: 100%; background-color: #E6E8E8; color: #37424A; } 202 | // div.body h6 { font-size: 100%; background-color: #E6E8E8; color: #37424A; } 203 | 204 | a.headerlink { 205 | color: #c60f0f; 206 | font-size: 0.8em; 207 | padding: 0 4px 0 4px; 208 | text-decoration: none; 209 | } 210 | 211 | a.headerlink:hover { 212 | background-color: #c60f0f; 213 | color: white; 214 | } 215 | 216 | div.body p, div.body dd, div.body li { 217 | line-height: 1.5em; 218 | } 219 | 220 | div.admonition p.admonition-title + p { 221 | display: inline; 222 | } 223 | 224 | div.highlight{ 225 | background-color: white; 226 | } 227 | 228 | div.note { 229 | background-color: #e6e8e8; 230 | border: 1px solid #ccc; 231 | } 232 | 233 | div.seealso { 234 | background-color: #ffc; 235 | border: 1px solid #ff6; 236 | } 237 | 238 | div.topic { 239 | background-color: #efefef; 240 | } 241 | 242 | div.warning { 243 | background-color: #F05C56; 244 | border: 1px solid #9C4850; 245 | color: #fff; 246 | } 247 | 248 | p.admonition-title { 249 | display: inline; 250 | } 251 | 252 | p.admonition-title:after { 253 | content: ":"; 254 | } 255 | 256 | pre { 257 | padding: 10px; 258 | background-color: White; 259 | color: #222; 260 | 
line-height: 1.2em; 261 | border: 1px solid #5e6a71; 262 | font-size: 1.1em; 263 | margin: 1.5em; 264 | -webkit-box-shadow: 1px 1px 1px #e6e8e8; 265 | -moz-box-shadow: 1px 1px 1px #e6e8e8; 266 | } 267 | 268 | tt { 269 | background-color: #ecf0f3; 270 | color: #222; 271 | /* padding: 1px 2px; */ 272 | font-size: 15px; 273 | font-family: monospace; 274 | } 275 | 276 | .viewcode-back { 277 | font-family: Arial, sans-serif; 278 | } 279 | 280 | div.viewcode-block:target { 281 | background-color: #f4debf; 282 | border-top: 1px solid #ac9; 283 | border-bottom: 1px solid #ac9; 284 | } 285 | 286 | table.docutils { 287 | margin: 1.5em; 288 | } 289 | 290 | div.sidebar { 291 | border: 1px solid #5E6A71; 292 | background-color: #E6E8E8; 293 | } 294 | 295 | div.admonition.tip { 296 | background-color: #80D2DC; 297 | border: 1px solid #55AEBA; 298 | } 299 | 300 | div.admonition.important { 301 | background-color: #F05C56; 302 | border: 1px solid #9C4850; 303 | color: #fff; 304 | } 305 | 306 | div.tip tt.literal { 307 | background-color: #55aeba; 308 | color: #fff; 309 | } 310 | 311 | div.important tt.literal { 312 | background-color: #9C4850; 313 | color: #fff; 314 | } 315 | 316 | h2 .literal { 317 | color: #fff; 318 | background-color: #37424a; 319 | } 320 | 321 | dl.glossary dt { 322 | font-size: 1.0em; 323 | padding-top:20px; 324 | 325 | } -------------------------------------------------------------------------------- /ceph_medic/checks/common.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from ceph_medic import metadata, daemon_types 3 | from ceph_medic.util import configuration, str_to_int 4 | 5 | 6 | # 7 | # Utilities 8 | # 9 | 10 | def get_fsid(data): 11 | # FIXME: might want to load this thing into ConfigParser so that we can fetch 12 | # information. ceph-deploy is a good example on how to do this. 
See: 13 | # https://github.com/ceph/ceph-deploy/blob/master/ceph_deploy/conf/ceph.py 14 | cluster_path = '/etc/ceph/%s.conf' % metadata['cluster_name'] 15 | try: 16 | contents = data['paths']['/etc/ceph']['files'][cluster_path]['contents'] 17 | except KeyError: 18 | return '' 19 | conf = configuration.load_string(contents) 20 | try: 21 | return conf.get_safe('global', 'fsid', '') 22 | except IndexError: 23 | return '' 24 | 25 | 26 | def get_common_fsid(): 27 | """ 28 | Determine what is the most common Cluster FSID. If all of them are the same 29 | then we are fine, but if there is a mix, we need some base to compare to. 30 | """ 31 | all_fsids = [] 32 | 33 | for daemon_type in daemon_types: 34 | for node_metadata in metadata[daemon_type].values(): 35 | fsids = get_host_fsids(node_metadata) 36 | all_fsids.extend(fsids) 37 | 38 | try: 39 | common_fsid = Counter(all_fsids).most_common()[0][0] 40 | except IndexError: 41 | return '' 42 | return common_fsid 43 | 44 | 45 | def get_host_fsids(node_metadata): 46 | """ 47 | Return all the cluster FSIDs found for each socket in a host 48 | """ 49 | all_fsids = [] 50 | for socket_metadata in node_metadata['ceph']['sockets'].values(): 51 | config = socket_metadata.get('config', {}) 52 | if not config: 53 | continue 54 | fsid = config.get('fsid') 55 | if not fsid: 56 | continue 57 | all_fsids.append(fsid) 58 | return all_fsids 59 | 60 | 61 | # 62 | # Warning checks 63 | # 64 | 65 | def check_colocated_running_mons_osds(host, data): 66 | code = 'WCOM1' 67 | msg = 'collocated OSDs with MONs running: %s' 68 | sockets = data['ceph']['sockets'] 69 | running_mons = [] 70 | running_osds = [] 71 | for socket_name in sockets.keys(): 72 | if "mon." in socket_name: 73 | running_mons.append(socket_name) 74 | elif "osd." 
in socket_name: 75 | running_osds.append(socket_name) 76 | if running_mons and running_osds: 77 | daemons = "\n %s" % ','.join(running_osds) 78 | return code, msg % daemons 79 | 80 | 81 | # 82 | # Error checks 83 | # 84 | 85 | 86 | def check_ceph_conf_exists(host, data): 87 | cluster_conf = '/etc/ceph/%s.conf' % metadata['cluster_name'] 88 | 89 | files = data['paths']['/etc/ceph']['files'].keys() 90 | if cluster_conf not in files: 91 | msg = "%s does not exist" % cluster_conf 92 | return 'ECOM1', msg 93 | 94 | 95 | def check_ceph_executable_exists(host, data): 96 | if data['ceph']['installed'] is False: 97 | return 'ECOM2', 'ceph executable was not found in common paths when running `which`' 98 | 99 | 100 | def check_var_lib_ceph_dir(host, data): 101 | code = 'ECOM3' 102 | exception = data['paths']['/var/lib/ceph']['dirs']['/var/lib/ceph']['exception'] 103 | if exception: 104 | msg = '/var/lib/ceph could not be parsed: %s' % exception['repr'] 105 | return code, msg 106 | 107 | 108 | def check_var_lib_ceph_permissions(host, data): 109 | code = 'ECOM4' 110 | group = data['paths']['/var/lib/ceph']['dirs']['/var/lib/ceph']['group'] 111 | owner = data['paths']['/var/lib/ceph']['dirs']['/var/lib/ceph']['owner'] 112 | if group == owner != 'ceph': 113 | msg = '/var/lib/ceph has invalid ownership: %s:%s, should be ceph:ceph' % (owner, group) 114 | return code, msg 115 | 116 | 117 | def check_cluster_fsid(host, data): 118 | code = 'ECOM5' 119 | msg = 'fsid "%s" is different than host(s): %s' 120 | mismatched_hosts = [] 121 | 122 | current_fsid = get_fsid(data) 123 | 124 | # no fsid exists for the current host as defined in ceph.conf, let other 125 | # checks note about this instead of reporting an empty FSID 126 | if not current_fsid: 127 | return 128 | 129 | for daemon, hosts in metadata['nodes'].items(): 130 | for host in hosts: 131 | hostname = host['host'] 132 | host_fsid = get_fsid(metadata[daemon][hostname]) 133 | if host_fsid and current_fsid != host_fsid: 134 | 
mismatched_hosts.append(hostname) 135 | 136 | if mismatched_hosts: 137 | return code, msg % (current_fsid, ','.join(mismatched_hosts)) 138 | 139 | 140 | def check_ceph_version_parity(host, data): 141 | code = 'ECOM6' 142 | msg = '(installed) Ceph version "%s" is different than host(s): %s' 143 | mismatched_hosts = [] 144 | host_version = data['ceph']['version'] 145 | for daemon, hosts in metadata['nodes'].items(): 146 | for host in hosts: 147 | hostname = host['host'] 148 | version = metadata[daemon][hostname]['ceph']['version'] 149 | if host_version != version: 150 | mismatched_hosts.append(hostname) 151 | 152 | if mismatched_hosts: 153 | return code, msg % (host_version, ','.join(mismatched_hosts)) 154 | 155 | 156 | def check_ceph_socket_and_installed_version_parity(host, data): 157 | code = 'ECOM7' 158 | msg = '(installed) Ceph version "%s" is different than version from running socket(s): %s' 159 | mismatched_sockets = [] 160 | host_version = data['ceph']['version'] 161 | sockets = data['ceph']['sockets'] 162 | for socket, socket_data in sockets.items(): 163 | socket_version = socket_data['version'].get('version') 164 | if socket_version and socket_version not in host_version: 165 | mismatched_sockets.append("%s:%s" % (socket, socket_version)) 166 | 167 | if mismatched_sockets: 168 | return code, msg % (host_version, ','.join(mismatched_sockets)) 169 | 170 | 171 | def check_rgw_num_rados_handles(host, data): 172 | """ 173 | Although this is an RGW setting, the way Ceph handles configurations can 174 | have this setting be different depending on the daemon. Since we are 175 | checking on every host and every socket, we are placing this check here 176 | with common checks. 
177 | """ 178 | code = 'WCOM7' 179 | msg = "rgw_num_rados_handles shouldn't be larger than 1, can lead to memory leaks: %s" 180 | sockets = data['ceph']['sockets'] 181 | failed = [] 182 | for socket, socket_data in sockets.items(): 183 | config = socket_data.get('config', {}) 184 | if not config: 185 | continue 186 | rgw_num_rados_handles = config.get('rgw_num_rados_handles', 1) 187 | name = socket.split('/var/run/ceph/')[-1] 188 | rgw_num_rados_handles = str_to_int(rgw_num_rados_handles) 189 | if rgw_num_rados_handles > 1: 190 | failed.append(name) 191 | 192 | if failed: 193 | return code, msg % ','.join(failed) 194 | 195 | 196 | def check_fsid_exists(host, data): 197 | code = 'ECOM8' 198 | msg = "'fsid' is missing in the ceph configuration" 199 | 200 | current_fsid = get_fsid(data) 201 | if not current_fsid: 202 | return code, msg 203 | 204 | 205 | def check_fsid_per_daemon(host, data): 206 | """ 207 | In certain deployments types (hi rook!) the FSID will not be present in a 208 | ceph conf file - it will be passed in *directly* to the daemon as an 209 | argument. We aren't going to parse arguments, but the admin socket allows 210 | us to poke inside and check what cluster FSID the daemon is associated 211 | with. 212 | """ 213 | code = 'ECOM9' 214 | msg = 'Found cluster FSIDs from running sockets different than: %s' 215 | sockets = data['ceph']['sockets'] 216 | common_fsid = get_common_fsid() 217 | if not common_fsid: # is this even possible? 
218 | return 219 | 220 | msg = msg % common_fsid 221 | sockets = data['ceph']['sockets'] 222 | failed = False 223 | for socket, socket_data in sockets.items(): 224 | config = socket_data.get('config', {}) 225 | if not config: 226 | continue 227 | socket_fsid = config.get('fsid') 228 | if not socket_fsid: 229 | continue 230 | if socket_fsid != common_fsid: 231 | name = socket.split('/var/run/ceph/')[-1] 232 | msg += '\n %s : %s' % (name, socket_fsid) 233 | failed = True 234 | if failed: 235 | return code, msg 236 | 237 | 238 | def check_multiple_running_mons(host, data): 239 | code = 'ECOM10' 240 | msg = 'multiple running mons found: %s' 241 | sockets = data['ceph']['sockets'] 242 | running_mons = [] 243 | for socket_name in sockets.keys(): 244 | if "mon." in socket_name: 245 | running_mons.append(socket_name) 246 | if len(running_mons) > 1: 247 | return code, msg % ','.join(running_mons) 248 | -------------------------------------------------------------------------------- /ceph_medic/runner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from ceph_medic import metadata, terminal, daemon_types 3 | from ceph_medic import checks, __version__ 4 | from ceph_medic import config 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Runner(object): 10 | 11 | def __init__(self): 12 | self.passed = 0 13 | self.skipped = 0 14 | self.total = 0 15 | self.errors = 0 16 | self.warnings = 0 17 | self.ignore = [] 18 | self.internal_errors = [] 19 | 20 | @property 21 | def total_hosts(self): 22 | # XXX does not ensure unique nodes. In collocated scenarios, a single 23 | # node that is a 'mon' and an 'osd' would count as two nodes 24 | count = 0 25 | for daemon in metadata['nodes'].values(): 26 | count += len(daemon) 27 | return count 28 | 29 | def run(self): 30 | """ 31 | Go through all the daemons, and all checks. Single entrypoint for running 32 | checks everywhere. 
33 | """ 34 | start_header() 35 | for daemon_type in daemon_types: 36 | self.run_daemons(daemon_type) 37 | 38 | # these are checks that should run once per cluster 39 | nodes_header('cluster') 40 | self.run_cluster(checks.cluster) 41 | 42 | if metadata['failed_nodes']: 43 | terminal.write.bold('\n{daemon:-^30}\n'.format(daemon=' Failed Nodes ')) 44 | for host, reason in metadata['failed_nodes'].items(): 45 | terminal.loader.write(' %s' % terminal.red(host)) 46 | terminal.write.write('\n') 47 | reason_lines = reason.split('\n') 48 | main_reason = reason_lines.pop(0) 49 | terminal.write.write(" %s\n" % main_reason) 50 | for line in reason_lines: 51 | terminal.write.write(" %s\n" % line) 52 | self.total = self.errors + self.warnings + self.passed + len(self.internal_errors) 53 | return self 54 | 55 | def run_daemons(self, daemon_type): 56 | has_nodes = metadata[daemon_type] 57 | is_daemon = daemon_type in metadata['nodes'] 58 | if has_nodes and is_daemon: # we have nodes of this type to run 59 | nodes_header(daemon_type) 60 | else: 61 | return 62 | 63 | for host, data in metadata[daemon_type].items(): 64 | modules = [checks.common, getattr(checks, daemon_type, None)] 65 | self.run_host(host, data, modules) 66 | 67 | def run_cluster(self, module): 68 | # XXX get the cluster name here 69 | cluster_name = '%s cluster' % metadata.get('cluster_name', 'ceph') 70 | terminal.loader.write(' %s' % terminal.yellow(cluster_name)) 71 | has_error = False 72 | checks = collect_checks(module) 73 | for check in checks: 74 | try: 75 | # TODO: figure out how to skip running a specific check if 76 | # the code is ignored, maybe introspecting the function? 
77 | result = getattr(module, check)() 78 | except Exception as error: 79 | result = None 80 | logger.exception('check had an unhandled error: %s', check) 81 | self.internal_errors.append(error) 82 | if result: 83 | code, message = result 84 | # XXX This is not ideal, we shouldn't need to get all the way here 85 | # to make sure this is actually ignored. (Or maybe it doesn't matter?) 86 | if code in self.ignore: 87 | self.skipped += 1 88 | # avoid writing anything else to the terminal, and just 89 | # go to the next check 90 | continue 91 | if not has_error: 92 | # XXX get the cluster name here 93 | terminal.loader.write(' %s' % terminal.red(cluster_name)) 94 | terminal.write.write('\n') 95 | 96 | if code.startswith('E'): 97 | code = terminal.red(code) 98 | self.errors += 1 99 | elif code.startswith('W'): 100 | code = terminal.yellow(code) 101 | self.warnings += 1 102 | terminal.write.write(" %s: %s\n" % (code, message)) 103 | has_error = True 104 | else: 105 | self.passed += 1 106 | 107 | if not has_error: 108 | terminal.loader.write(' %s\n' % terminal.green(cluster_name)) 109 | 110 | def run_host(self, host, data, modules): 111 | terminal.loader.write(' %s' % terminal.yellow(host)) 112 | has_error = False 113 | for module in modules: 114 | checks = collect_checks(module) 115 | for check in checks: 116 | try: 117 | # TODO: figure out how to skip running a specific check if 118 | # the code is ignored, maybe introspecting the function? 119 | result = getattr(module, check)(host, data) 120 | except Exception as error: 121 | result = None 122 | logger.exception('check had an unhandled error: %s', check) 123 | self.internal_errors.append(error) 124 | if result: 125 | code, message = result 126 | # XXX This is not ideal, we shouldn't need to get all the way here 127 | # to make sure this is actually ignored. (Or maybe it doesn't matter?) 
128 | if code in self.ignore: 129 | self.skipped += 1 130 | # avoid writing anything else to the terminal, and just 131 | # go to the next check 132 | continue 133 | if not has_error: 134 | terminal.loader.write(' %s' % terminal.red(host)) 135 | terminal.write.write('\n') 136 | 137 | if code.startswith('E'): 138 | self.errors += 1 139 | code = terminal.red(code) 140 | elif code.startswith('W'): 141 | self.warnings += 1 142 | code = terminal.yellow(code) 143 | terminal.write.write(" %s: %s\n" % (code, message)) 144 | has_error = True 145 | else: 146 | self.passed += 1 147 | 148 | if not has_error: 149 | terminal.loader.write(' %s\n' % terminal.green(host)) 150 | 151 | 152 | run_errors = terminal.yellow(""" 153 | While running checks, ceph-medic had %s unhandled errors, please look at the 154 | configured log file and report the issue along with the traceback. 155 | """) 156 | 157 | 158 | def report(results): 159 | msg = "\n{passed}{error}{warning}{skipped}{internal_errors}{hosts}" 160 | 161 | if results.errors: 162 | msg = terminal.red(msg) 163 | elif results.warnings: 164 | msg = terminal.yellow(msg) 165 | else: 166 | msg = terminal.green(msg) 167 | 168 | errors = warnings = internal_errors = '' 169 | 170 | if results.errors: 171 | errors = '%s errors, ' % results.errors if results.errors > 1 else '1 error, ' 172 | if results.warnings: 173 | warnings = '%s warnings, ' % results.warnings if results.warnings > 1 else '1 warning, ' 174 | if results.internal_errors: 175 | internal_errors = "%s internal errors, " % len(results.internal_errors) 176 | 177 | terminal.write.raw( 178 | msg.format( 179 | passed="%s passed, " % results.passed, 180 | error=errors, 181 | warning=warnings, 182 | skipped="%s skipped, " % results.skipped if results.skipped else '', 183 | internal_errors=internal_errors, 184 | hosts="on %s hosts" % results.total_hosts 185 | ) 186 | ) 187 | if results.internal_errors: 188 | terminal.write.raw(run_errors % len(results.internal_errors)) 189 | 190 | 191 
| start_header_tmpl = """ 192 | {title:=^80} 193 | Version: {version: >4} Cluster Name: "{cluster_name}" 194 | Connection: {connection_type} 195 | Total hosts: [{total_hosts}] 196 | OSDs: {osds: >4} MONs: {mons: >4} Clients: {clients: >4} 197 | MDSs: {mdss: >4} RGWs: {rgws: >4} MGRs: {mgrs: >7} 198 | """ 199 | 200 | 201 | def start_header(): 202 | connection_type = config.file.get_safe('global', 'deployment_type', 'ssh') 203 | daemon_totals = dict((daemon, 0) for daemon in daemon_types) 204 | total_hosts = 0 205 | for daemon in daemon_types: 206 | count = len(metadata[daemon].keys()) 207 | total_hosts += count 208 | daemon_totals[daemon] = count 209 | terminal.write.raw(start_header_tmpl.format( 210 | title=' Starting remote check session ', 211 | version=__version__, 212 | connection_type=connection_type, 213 | total_hosts=total_hosts, 214 | cluster_name=metadata['cluster_name'], 215 | **daemon_totals)) 216 | terminal.write.raw('=' * 80) 217 | 218 | 219 | def nodes_header(daemon_type): 220 | readable_daemons = { 221 | 'rgws': ' rados gateways ', 222 | 'mgrs': ' managers ', 223 | 'mons': ' mons ', 224 | 'osds': ' osds ', 225 | 'clients': ' clients ', 226 | 'cluster': ' cluster ', 227 | } 228 | 229 | terminal.write.bold('\n{daemon:-^30}\n'.format( 230 | daemon=readable_daemons.get(daemon_type, daemon_type))) 231 | 232 | 233 | def collect_checks(module): 234 | checks = [i for i in dir(module) if i.startswith('check')] 235 | return checks 236 | -------------------------------------------------------------------------------- /ceph_medic/collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collect remote information on Ceph daemons, store everything in memory and make 3 | it available as a global part of the module so that other checks can consume it 4 | """ 5 | from ceph_medic import metadata, remote, terminal 6 | from ceph_medic.terminal import loader 7 | from ceph_medic.connection import get_connection 8 | from 
execnet.gateway_bootstrap import HostNotFound
import logging


logger = logging.getLogger(__name__)


def collect_paths(conn):
    """
    Gather all the interesting paths from the remote system, stat them, and
    capture contents when needed.

    Generates a tree path, using the "path of interest" as key, and appending
    the absolute paths of files in the 'files' key and directories in the
    'dirs' key. A small subset of a tree would look
    very similar to::

        {
            '/etc/ceph': {
                'dirs': {
                    '/etc/ceph/ceph.d': {...},
                },
                'files': {
                    '/etc/ceph/ceph.d/ceph.conf': {...},
                },
            }
        }

    Each file and dir in a path tree will contain a set of keys populated
    mostly by calling ``stat`` on the remote system for that absolute path, in
    addition to capturing contents when "interesting files" are defined. For
    example, the contents of a ``ceph.conf`` file will always be captured. This
    is how that file would look like in a tree path::


        {
            '/etc/ceph/ceph.d/test.conf':
                {
                 'contents': '[osd]\nosd mkfs type = xfs\nosd mkfs options[...] ',
                 'exception': {},
                 'group': 'ceph',
                 'n_fields': 16,
                 'n_sequence_fields': 10,
                 'n_unnamed_fields': 3,
                 'owner': 'ceph',
                 'st_atime': 1492721509.572292,
                 'st_blksize': 4096,
                 'st_blocks': 8,
                 'st_ctime': 1492721507.880156,
                 'st_dev': 64768L,
                 'st_gid': 167,
                 'st_ino': 100704475,
                 'st_mode': 33188,
                 'st_mtime': 1492721506.1060133,
                 'st_nlink': 1,
                 'st_rdev': 0,
                 'st_size': 650,
                 'st_uid': 167
                },

        }

    .. note:: ``contents`` is captured using ``file.read()`` so its value will
        be a single line with possible line breaks (if any). For reading and
        parsing that key on each line a split must be done on the line break.

    """
    path_metadata = {}
    # paths of interest, each with its per-path collection options
    paths = {
        "/etc/ceph": {'get_contents': True},
        "/var/lib/ceph": {
            'get_contents': True,
            'skip_files': ['activate.monmap', 'superblock'],
            'skip_dirs': ['tmp', 'current', 'store.db']
        },
        "/var/run/ceph": {'get_contents': False},
    }
    for p, kw in paths.items():
        # Collect metadata about the files and dirs for the given path and assign
        # it back to the path_metadata for the current node
        path_metadata[p] = get_path_metadata(conn, p, **kw)
    return path_metadata


def get_path_metadata(conn, path, **kw):
    """
    Stat every file and directory found under ``path`` on the remote side,
    returning a ``{'dirs': {...}, 'files': {...}}`` mapping keyed by
    absolute path.
    """
    # generate the tree
    tree = conn.remote_module.path_tree(
        path,
        kw.get('skip_dirs'),
        kw.get('skip_files'),
        kw.get('get_contents')
    )

    files = {}
    dirs = {}

    # contents are only fetched for files, and only when requested
    for i in tree['files']:
        files[i] = conn.remote_module.stat_path(i, None, None, kw.get('get_contents'))
    for i in tree['dirs']:
        dirs[i] = conn.remote_module.stat_path(i, None, None, False)

    # actual root path
    dirs[path] = conn.remote_module.stat_path(path, None, None, False)

    return {'dirs': dirs, 'files': files}


def get_node_metadata(conn, hostname, cluster_nodes):
    """
    Build the full metadata dictionary for one remote host: paths, network,
    devices, and ceph information, updating the terminal loader as each
    section is collected.
    """
    # "import" the remote functions so that remote calls using the
    # functions can be executed
    conn.import_module(remote.functions)

    node_metadata = {'ceph': {}}

    # collect paths and files first
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.yellow('paths')))
    node_metadata['paths'] = collect_paths(conn)
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.green('paths')))

    # TODO: collect network information, passing all the cluster_nodes
    # so that it can check for inter-node connectivity
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.yellow('network')))
    node_metadata['network'] = collect_network(cluster_nodes)
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.green('network')))

    # TODO: collect device information
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.yellow('devices')))
    node_metadata['devices'] = collect_devices()
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.green('devices')))

    # collect ceph information
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.yellow('ceph information')))
    node_metadata['ceph'] = collect_ceph_info(conn)
    node_metadata['ceph']['sockets'] = collect_socket_info(conn, node_metadata)
    node_metadata['ceph']['osd'] = collect_ceph_osd_info(conn)
    loader.write('Host: %-*s collecting: [%s]' % (40, hostname, terminal.green('ceph information')))

    return node_metadata


def collect():
    """
    The main collecting entrypoint. This function will call all the pieces
    needed to build the complete metadata set of a remote system so that checks
    can consume and verify that data.

    After collection is done, the full contents of the metadata are available
    at ``ceph_medic.metadata``
    """
    cluster_nodes = metadata['nodes']
    loader.write('collecting remote node information')
    total_nodes = 0
    failed_nodes = 0
    has_cluster_data = False

    for node_type, nodes in cluster_nodes.items():
        for node in nodes:
            # check if a node type exists for this node before doing any work:
            try:
                metadata[node_type]
            except KeyError:
                msg = "Skipping node {} from unknown host group: {}".format(node, node_type)
                logger.warning(msg)
                continue

            total_nodes += 1
            hostname = node['host']
            loader.write('Host: %-40s connection: [%-20s]' % (hostname, terminal.yellow('connecting')))
            # TODO: make sure that the hostname is resolvable, trying to
            # debug SSH issues with execnet is pretty hard/impossible, use
            # util.net.host_is_resolvable
            try:
                logger.debug('attempting connection to host: %s', node['host'])
                conn = get_connection(node['host'], container=node.get('container'))
                loader.write('Host: %-40s connection: [%-20s]' % (hostname, terminal.green('connected')))
                loader.write('\n')
            except HostNotFound as err:
                logger.exception('connection failed')
                loader.write('Host: %-40s connection: [%-20s]' % (hostname, terminal.red('failed')))
                loader.write('\n')
                failed_nodes += 1
                # remove the unreachable host from the metadata so checks do
                # not try to consume it, and record the failure reason
                if metadata[node_type].get(hostname):
                    metadata[node_type].pop(hostname)
                metadata['nodes'][node_type] = [i for i in metadata['nodes'][node_type] if i['host'] != hostname]
                metadata['failed_nodes'].update({hostname: str(err)})
                continue

            # send the full node metadata for global scope so that the checks
            # can consume this
            metadata[node_type][hostname] = get_node_metadata(conn, hostname, cluster_nodes)
            if node_type == 'mons':  # if node type is monitor, admin privileges are most likely authorized
                # cluster-wide data only needs to be captured once
                if not has_cluster_data:
                    cluster_data = collect_cluster(conn)
                    if cluster_data:
                        metadata['cluster'] = cluster_data
                        has_cluster_data = True
            conn.exit()

    if failed_nodes == total_nodes:
        loader.write(terminal.red('Collection failed!') + ' ' *70)
        # TODO: this helps clear out the 'loader' line so that the error looks
        # clean, but this manual clearing should be done automatically
        terminal.write.raw('')
        raise RuntimeError('All nodes failed to connect. Cannot run any checks')
    if failed_nodes:
        loader.write(terminal.yellow('Collection completed with some failed connections' + ' ' *70 + '\n'))
    else:
        loader.write('Collection completed!' + ' ' *70 + '\n')


# Network
#
def collect_network(cluster_nodes):
    """
    Collect node-specific information, but also try to check connectivity to
    other hosts that are passed in as ``cluster_nodes``
    """
    # TODO: not implemented yet, always returns an empty mapping
    return {}


# Devices
#
def collect_devices():
    """
    Get all the device information from the current node
    """
    # TODO: not implemented yet, always returns an empty mapping
    return {}


# Ceph
#
def collect_ceph_info(conn):
    # installed ceph version, and whether the executable exists at all
    result = dict()
    result['version'] = remote.commands.ceph_version(conn)
    result['installed'] = remote.commands.ceph_is_installed(conn)
    return result


def collect_cluster(conn):
    """
    Captures useful cluster information like the status
    """
    result = dict()
    result['status'] = remote.commands.ceph_status(conn)
    return result


# Ceph socket info
#
def collect_socket_info(conn, node_metadata):
    # every admin socket (*.asok) previously found under /var/run/ceph
    sockets = [socket for socket in node_metadata['paths']['/var/run/ceph']['files']
               if socket.endswith(".asok")]
    result = dict()
    for socket in sockets:
        result[socket] = {'version': {}, 'config': {}}
result[socket]['version'] = remote.commands.ceph_socket_version(conn, socket) 264 | result[socket]['config'] = remote.commands.daemon_socket_config(conn, socket) 265 | return result 266 | 267 | 268 | # Ceph OSD info 269 | # 270 | def collect_ceph_osd_info(conn): 271 | result = {'dump': {}} 272 | result['dump'] = remote.commands.ceph_osd_dump(conn) 273 | return result 274 | -------------------------------------------------------------------------------- /tests/functional/Vagrantfile: -------------------------------------------------------------------------------- 1 | # -*- mode: ruby -*- 2 | # vi: set ft=ruby : 3 | 4 | require 'yaml' 5 | require 'time' 6 | VAGRANTFILE_API_VERSION = '2' 7 | 8 | DEBUG = false 9 | 10 | config_file=File.expand_path(File.join(File.dirname(__FILE__), 'vagrant_variables.yml')) 11 | settings=YAML.load_file(config_file) 12 | 13 | LABEL_PREFIX = settings['label_prefix'] ? settings['label_prefix'] + "-" : "" 14 | NMONS = settings['mon_vms'] 15 | NOSDS = settings['osd_vms'] 16 | NMDSS = settings['mds_vms'] 17 | NRGWS = settings['rgw_vms'] 18 | NNFSS = settings['nfs_vms'] 19 | RESTAPI = settings['restapi'] 20 | NRBD_MIRRORS = settings['rbd_mirror_vms'] 21 | CLIENTS = settings['client_vms'] 22 | NISCSI_GWS = settings['iscsi_gw_vms'] 23 | PUBLIC_SUBNET = settings['public_subnet'] 24 | CLUSTER_SUBNET = settings['cluster_subnet'] 25 | BOX = settings['vagrant_box'] 26 | CLIENT_BOX = settings['client_vagrant_box'] 27 | BOX_URL = settings['vagrant_box_url'] 28 | SYNC_DIR = settings['vagrant_sync_dir'] 29 | MEMORY = settings['memory'] 30 | ETH = settings['eth'] 31 | USER = settings['ssh_username'] 32 | 33 | ASSIGN_STATIC_IP = settings.fetch('assign_static_ip', true) 34 | DISABLE_SYNCED_FOLDER = settings.fetch('vagrant_disable_synced_folder', false) 35 | DISK_UUID = Time.now.utc.to_i 36 | 37 | def create_vmdk(name, size) 38 | dir = Pathname.new(__FILE__).expand_path.dirname 39 | path = File.join(dir, '.vagrant', name + '.vmdk') 40 | `vmware-vdiskmanager 
# Create a VMware .vmdk disk image under .vagrant/ unless one already exists.
# Called from the OSD definitions below to attach extra data disks.
def create_vmdk(name, size)
  dir = Pathname.new(__FILE__).expand_path.dirname
  path = File.join(dir, '.vagrant', name + '.vmdk')
  `vmware-vdiskmanager -c -s #{size} -t 0 -a scsi #{path} \
   2>&1 > /dev/null` unless File.exist?(path)
end

# Multi-machine cluster definition: one VM group per Ceph role
# (clients, rgws, nfss, mdss, rbd mirrors, iscsi gateways, mons, osds),
# with counts and settings driven by vagrant_variables.yml. Each group
# carries per-provider tuning for virtualbox / vmware_fusion / libvirt /
# parallels / linode.
Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
  config.ssh.insert_key = false # workaround for https://github.com/mitchellh/vagrant/issues/5048
  config.ssh.private_key_path = settings['ssh_private_key_path']
  config.ssh.username = USER

  # When using libvirt, avoid errors like:
  # "CPU feature cmt not found"
  config.vm.provider :libvirt do |lv|
    lv.cpu_mode = 'host-passthrough'
  end

  # Faster bootup. Disables mounting the sync folder for libvirt and virtualbox
  if DISABLE_SYNCED_FOLDER
    config.vm.provider :virtualbox do |v,override|
      override.vm.synced_folder '.', SYNC_DIR, disabled: true
    end
    config.vm.provider :libvirt do |v,override|
      override.vm.synced_folder '.', SYNC_DIR, disabled: true
    end
  end

  # Client VMs: use CLIENT_BOX (may differ from the cluster box) and the
  # public subnet's .4x addresses.
  (0..CLIENTS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}client#{i}" do |client|
      client.vm.box = CLIENT_BOX
      client.vm.hostname = "#{LABEL_PREFIX}ceph-client#{i}"
      if ASSIGN_STATIC_IP
        client.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.4#{i}"
      end
      # Virtualbox
      client.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      client.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      client.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end

      # Parallels
      client.vm.provider "parallels" do |prl|
        prl.name = "ceph-client#{i}"
        prl.memory = "#{MEMORY}"
      end

      client.vm.provider :linode do |provider|
        provider.label = client.vm.hostname
      end
    end
  end

  # RGW (Rados Gateway) VMs: public subnet .5x addresses.
  (0..NRGWS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}rgw#{i}" do |rgw|
      rgw.vm.box = BOX
      rgw.vm.box_url = BOX_URL
      rgw.vm.hostname = "#{LABEL_PREFIX}ceph-rgw#{i}"
      if ASSIGN_STATIC_IP
        rgw.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.5#{i}"
      end

      # Virtualbox
      rgw.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      rgw.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      rgw.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end

      # Parallels
      rgw.vm.provider "parallels" do |prl|
        prl.name = "ceph-rgw#{i}"
        prl.memory = "#{MEMORY}"
      end

      rgw.vm.provider :linode do |provider|
        provider.label = rgw.vm.hostname
      end
    end
  end

  # NFS gateway VMs: public subnet .6x addresses.
  # NOTE(review): unlike every other group, nfs names do not get
  # LABEL_PREFIX applied — presumably intentional, confirm before changing.
  (0..NNFSS - 1).each do |i|
    config.vm.define "nfs#{i}" do |nfs|
      nfs.vm.box = BOX
      nfs.vm.box_url = BOX_URL
      nfs.vm.hostname = "ceph-nfs#{i}"
      if ASSIGN_STATIC_IP
        nfs.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.6#{i}"
      end

      # Virtualbox
      nfs.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      nfs.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      nfs.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end

      # Parallels
      nfs.vm.provider "parallels" do |prl|
        prl.name = "ceph-nfs#{i}"
        prl.memory = "#{MEMORY}"
      end

      nfs.vm.provider :linode do |provider|
        provider.label = nfs.vm.hostname
      end
    end
  end

  # MDS (metadata server) VMs: public subnet .7x addresses.
  (0..NMDSS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}mds#{i}" do |mds|
      mds.vm.box = BOX
      mds.vm.box_url = BOX_URL
      mds.vm.hostname = "#{LABEL_PREFIX}ceph-mds#{i}"
      if ASSIGN_STATIC_IP
        mds.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.7#{i}"
      end
      # Virtualbox
      mds.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      mds.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      mds.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end
      # Parallels
      mds.vm.provider "parallels" do |prl|
        prl.name = "ceph-mds#{i}"
        prl.memory = "#{MEMORY}"
      end

      mds.vm.provider :linode do |provider|
        provider.label = mds.vm.hostname
      end
    end
  end

  # RBD mirror VMs: public subnet .8x addresses.
  (0..NRBD_MIRRORS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}rbd_mirror#{i}" do |rbd_mirror|
      rbd_mirror.vm.box = BOX
      rbd_mirror.vm.box_url = BOX_URL
      rbd_mirror.vm.hostname = "#{LABEL_PREFIX}ceph-rbd-mirror#{i}"
      if ASSIGN_STATIC_IP
        rbd_mirror.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.8#{i}"
      end
      # Virtualbox
      rbd_mirror.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      rbd_mirror.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      rbd_mirror.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end
      # Parallels
      rbd_mirror.vm.provider "parallels" do |prl|
        prl.name = "ceph-rbd-mirror#{i}"
        prl.memory = "#{MEMORY}"
      end

      rbd_mirror.vm.provider :linode do |provider|
        provider.label = rbd_mirror.vm.hostname
      end
    end
  end

  # iSCSI gateway VMs: public subnet .9x addresses.
  (0..NISCSI_GWS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}iscsi_gw#{i}" do |iscsi_gw|
      iscsi_gw.vm.box = BOX
      iscsi_gw.vm.box_url = BOX_URL
      iscsi_gw.vm.hostname = "#{LABEL_PREFIX}ceph-iscsi-gw#{i}"
      if ASSIGN_STATIC_IP
        iscsi_gw.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.9#{i}"
      end
      # Virtualbox
      iscsi_gw.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      iscsi_gw.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      iscsi_gw.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end
      # Parallels
      iscsi_gw.vm.provider "parallels" do |prl|
        prl.name = "ceph-iscsi-gw#{i}"
        prl.memory = "#{MEMORY}"
      end

      iscsi_gw.vm.provider :linode do |provider|
        provider.label = iscsi_gw.vm.hostname
      end
    end
  end

  # Monitor VMs: public subnet .1x addresses.
  (0..NMONS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}mon#{i}" do |mon|
      mon.vm.box = BOX
      mon.vm.box_url = BOX_URL
      mon.vm.hostname = "#{LABEL_PREFIX}ceph-mon#{i}"
      if ASSIGN_STATIC_IP
        mon.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.1#{i}"
      end
      # Virtualbox
      mon.vm.provider :virtualbox do |vb|
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      mon.vm.provider :vmware_fusion do |v|
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      mon.vm.provider :libvirt do |lv|
        lv.memory = MEMORY
        lv.random_hostname = true
      end

      # Parallels
      mon.vm.provider "parallels" do |prl|
        prl.name = "ceph-mon#{i}"
        prl.memory = "#{MEMORY}"
      end

      mon.vm.provider :linode do |provider|
        provider.label = mon.vm.hostname
      end
    end
  end

  # OSD VMs: dual-homed (public .10x and cluster .20x subnets) and given
  # extra data disks per provider to back the OSD daemons.
  (0..NOSDS - 1).each do |i|
    config.vm.define "#{LABEL_PREFIX}osd#{i}" do |osd|
      osd.vm.box = BOX
      osd.vm.box_url = BOX_URL
      osd.vm.hostname = "#{LABEL_PREFIX}ceph-osd#{i}"
      if ASSIGN_STATIC_IP
        osd.vm.network :private_network,
          ip: "#{PUBLIC_SUBNET}.10#{i}"
        osd.vm.network :private_network,
          ip: "#{CLUSTER_SUBNET}.20#{i}"
      end
      # Virtualbox
      osd.vm.provider :virtualbox do |vb|
        # Create our own controller for consistency and to remove VM dependency
        vb.customize ['storagectl', :id,
                      '--name', 'OSD Controller',
                      '--add', 'scsi']
        # Three 11 GB data disks per OSD VM, attached at SCSI ports 3-5.
        (0..2).each do |d|
          vb.customize ['createhd',
                        '--filename', "disk-#{i}-#{d}",
                        '--size', '11000'] unless File.exist?("disk-#{i}-#{d}.vdi")
          vb.customize ['storageattach', :id,
                        '--storagectl', 'OSD Controller',
                        '--port', 3 + d,
                        '--device', 0,
                        '--type', 'hdd',
                        '--medium', "disk-#{i}-#{d}.vdi"]
        end
        vb.customize ['modifyvm', :id, '--memory', "#{MEMORY}"]
      end

      # VMware
      osd.vm.provider :vmware_fusion do |v|
        # Two extra vmdk disks created via create_vmdk above.
        (0..1).each do |d|
          v.vmx["scsi0:#{d + 1}.present"] = 'TRUE'
          v.vmx["scsi0:#{d + 1}.fileName"] =
            create_vmdk("disk-#{i}-#{d}", '11000MB')
        end
        v.vmx['memsize'] = "#{MEMORY}"
      end

      # Libvirt
      driverletters = ('a'..'z').to_a
      osd.vm.provider :libvirt do |lv|
        # always make /dev/sd{a/b/c/d} so that CI can ensure that
        # virtualbox and libvirt will have the same devices to use for OSDs
        (0..3).each do |d|
          lv.storage :file, :device => "hd#{driverletters[d]}", :path => "disk-#{i}-#{d}-#{DISK_UUID}.disk", :size => '12G', :bus => "ide"
        end
        lv.memory = MEMORY
        lv.random_hostname = true
      end

      # Parallels
      osd.vm.provider "parallels" do |prl|
        prl.name = "ceph-osd#{i}"
        prl.memory = "#{MEMORY}"
        # Two extra SATA disks; loop index is unused because every disk is
        # added with identical options.
        (0..1).each do |d|
          prl.customize ["set", :id,
                         "--device-add",
                         "hdd",
                         "--iface",
                         "sata"]
        end
      end

      osd.vm.provider :linode do |provider|
        provider.label = osd.vm.hostname
      end

    end
  end
end