From a62594a2188343101d1c0ffa4588745d4a2044b4 Mon Sep 17 00:00:00 2001 From: juanvallejo Date: Fri, 10 Mar 2017 19:14:01 -0500 Subject: add etcd volume check --- .../openshift_checks/etcd_volume.py | 58 ++++++++ .../test/etcd_volume_test.py | 149 +++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 roles/openshift_health_checker/openshift_checks/etcd_volume.py create mode 100644 roles/openshift_health_checker/test/etcd_volume_test.py diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py new file mode 100644 index 000000000..cbdf70092 --- /dev/null +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -0,0 +1,58 @@ +# vim: expandtab:tabstop=4:shiftwidth=4 +""" +Ansible module for warning about etcd volume size past a defined threshold. +""" + +from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var + + +class EtcdVolume(OpenShiftCheck): + """Ensure disk size for an etcd host does not exceed a defined limit""" + + name = "etcd_volume" + tags = ["etcd", "health"] + + etcd_default_size_limit_percent = 0.9 + + def run(self, tmp, task_vars): + ansible_mounts = get_var(task_vars, "ansible_mounts") + + etcd_mount_path = self._get_etcd_mount_path(ansible_mounts) + etcd_disk_size_available = int(etcd_mount_path["size_available"]) + etcd_disk_size_total = int(etcd_mount_path["size_total"]) + etcd_disk_size_used = etcd_disk_size_total - etcd_disk_size_available + + size_limit_percent = get_var( + task_vars, + "etcd_disk_size_limit_percent", + default=self.etcd_default_size_limit_percent + ) + + if float(etcd_disk_size_used) / float(etcd_disk_size_total) > size_limit_percent: + msg = ("Current etcd volume usage ({actual:.2f} GB) for the volume \"{volume}\" " + "is greater than the storage limit ({limit:.2f} GB).") + msg = msg.format( + actual=self._to_gigabytes(etcd_disk_size_used), + volume=etcd_mount_path["mount"], + limit=self._to_gigabytes(size_limit_percent * etcd_disk_size_total), + ) + return {"failed": True, "msg": msg} + + return {"changed": False} + + @staticmethod + def _get_etcd_mount_path(ansible_mounts): + supported_mnt_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] + available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts} + + for path in supported_mnt_paths: + if path in available_mnts: + return available_mnts[path] + + paths = ', '.join(sorted(available_mnts)) or 'none' + msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths) + raise OpenShiftCheckException(msg) + + @staticmethod + def _to_gigabytes(byte_size): + return float(byte_size) / 10.0**9 diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py new file mode 100644 index 000000000..ff8d0d8d7 --- /dev/null +++ b/roles/openshift_health_checker/test/etcd_volume_test.py @@ -0,0 +1,149 @@ +import pytest + +from openshift_checks.etcd_volume import EtcdVolume, OpenShiftCheckException + + +@pytest.mark.parametrize('ansible_mounts,extra_words', [ + ([], ['none']), # empty ansible_mounts + ([{'mount': '/mnt'}], ['/mnt']), # missing relevant mount paths +]) +def test_cannot_determine_available_disk(ansible_mounts, extra_words): + task_vars = dict( + ansible_mounts=ansible_mounts, + ) + check = EtcdVolume(execute_module=fake_execute_module) + + with pytest.raises(OpenShiftCheckException) as excinfo: + check.run(tmp=None, task_vars=task_vars) + + for word in 'determine available disk'.split() + extra_words: + assert word in str(excinfo.value) + + +@pytest.mark.parametrize('size_limit,ansible_mounts', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 40 * 10**9, + 'size_total': 80 * 10**9 + }], + ), + ( + 1, + [{ + 'mount': '/', + 'size_available': 30 * 10**9, + 'size_total': 30 * 10**9, + }], + ), + ( + 20000000000, + [{ + 'mount': '/', + 'size_available': 20 * 10**9, + 'size_total': 40 * 10**9, + }], + ), + ( + 5000000000, + [{ + # not enough space on / ... + 'mount': '/', + 'size_available': 0, + 'size_total': 0, + }, { + # not enough space on /var/lib ... + 'mount': '/var/lib', + 'size_available': 2 * 10**9, + 'size_total': 21 * 10**9, + }, { + # ... but enough on /var/lib/etcd + 'mount': '/var/lib/etcd', + 'size_available': 36 * 10**9, + 'size_total': 40 * 10**9 + }], + ) +]) +def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): + task_vars = dict( + etcd_disk_size_limit_percent=size_limit, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_disk_size_limit_percent"] is None: + task_vars.pop("etcd_disk_size_limit_percent") + + check = EtcdVolume(execute_module=fake_execute_module) + result = check.run(tmp=None, task_vars=task_vars) + + assert not result.get('failed', False) + + +@pytest.mark.parametrize('size_limit_percent,ansible_mounts,extra_words', [ + ( + # if no size limit is specified, expect max usage + # limit to default to 90% of size_total + None, + [{ + 'mount': '/', + 'size_available': 1 * 10**9, + 'size_total': 100 * 10**9, + }], + ['90.00 GB'], + ), + ( + 0.7, + [{ + 'mount': '/', + 'size_available': 1 * 10**6, + 'size_total': 5 * 10**9, + }], + ['3.50 GB'], + ), + ( + 0.4, + [{ + 'mount': '/', + 'size_available': 2 * 10**9, + 'size_total': 6 * 10**9, + }], + ['2.40 GB'], + ), + ( + None, + [{ + # enough space on /var ... + 'mount': '/var', + 'size_available': 20 * 10**9, + 'size_total': 20 * 10**9, + }, { + # .. but not enough on /var/lib + 'mount': '/var/lib', + 'size_available': 1 * 10**9, + 'size_total': 20 * 10**9, + }], + ['18.00 GB'], + ), +]) +def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words): + task_vars = dict( + etcd_disk_size_limit_percent=size_limit_percent, + ansible_mounts=ansible_mounts, + ) + + if task_vars["etcd_disk_size_limit_percent"] is None: + task_vars.pop("etcd_disk_size_limit_percent") + + check = EtcdVolume(execute_module=fake_execute_module) + result = check.run(tmp=None, task_vars=task_vars) + + assert result['failed'] + for word in extra_words: + assert word in result['msg'] + + +def fake_execute_module(*args): + raise AssertionError('this function should not be called') -- cgit v1.2.3 From 8f52b334d618a489c0b2388303f5fa5132a59e2d Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Wed, 10 May 2017 09:14:50 +0200 Subject: Remove vim line It has been agreed that we don't use it any longer. --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 1 - 1 file changed, 1 deletion(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index cbdf70092..c769af973 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -1,4 +1,3 @@ -# vim: expandtab:tabstop=4:shiftwidth=4 """ Ansible module for warning about etcd volume size past a defined threshold. """ -- cgit v1.2.3 From 92377813448ff93e216ee688fb16f99958884b8c Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Wed, 10 May 2017 09:19:52 +0200 Subject: int -> float We don't need to convert to int and then to float. Read it as float from the start. --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index c769af973..00e240231 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -17,8 +17,8 @@ class EtcdVolume(OpenShiftCheck): ansible_mounts = get_var(task_vars, "ansible_mounts") etcd_mount_path = self._get_etcd_mount_path(ansible_mounts) - etcd_disk_size_available = int(etcd_mount_path["size_available"]) - etcd_disk_size_total = int(etcd_mount_path["size_total"]) + etcd_disk_size_available = float(etcd_mount_path["size_available"]) + etcd_disk_size_total = float(etcd_mount_path["size_total"]) etcd_disk_size_used = etcd_disk_size_total - etcd_disk_size_available size_limit_percent = get_var( @@ -27,7 +27,7 @@ class EtcdVolume(OpenShiftCheck): default=self.etcd_default_size_limit_percent ) - if float(etcd_disk_size_used) / float(etcd_disk_size_total) > size_limit_percent: + if etcd_disk_size_used / etcd_disk_size_total > size_limit_percent: msg = ("Current etcd volume usage ({actual:.2f} GB) for the volume \"{volume}\" " "is greater than the storage limit ({limit:.2f} GB).") msg = msg.format( -- cgit v1.2.3 From 95fd767a8242c66f3d62428cb6fce88e2eca3c13 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Wed, 10 May 2017 17:11:23 +0200 Subject: Update check --- .../openshift_checks/etcd_volume.py | 68 +++++++++++----------- .../test/etcd_volume_test.py | 26 ++++----- 2 files changed, 46 insertions(+), 48 deletions(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index 00e240231..ad88ae44d 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -1,57 +1,55 @@ -""" -Ansible module for warning about etcd volume size past a defined threshold. -""" - from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var class EtcdVolume(OpenShiftCheck): - """Ensure disk size for an etcd host does not exceed a defined limit""" + """Ensures etcd storage usage does not exceed a given threshold.""" name = "etcd_volume" tags = ["etcd", "health"] - etcd_default_size_limit_percent = 0.9 + # pylint: disable=invalid-name + default_etcd_device_usage_threshold_percent = 90 + # where to find ectd data, higher priority first. + supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] - def run(self, tmp, task_vars): - ansible_mounts = get_var(task_vars, "ansible_mounts") + @classmethod + def is_active(cls, task_vars): + # TODO: only execute this check on hosts in the 'ectd' group? + # Maybe also 'masters' if there are no standalone etcd hosts? + return super(EtcdVolume, cls).is_active(task_vars) - etcd_mount_path = self._get_etcd_mount_path(ansible_mounts) - etcd_disk_size_available = float(etcd_mount_path["size_available"]) - etcd_disk_size_total = float(etcd_mount_path["size_total"]) - etcd_disk_size_used = etcd_disk_size_total - etcd_disk_size_available + def run(self, tmp, task_vars): + mount_info = self._etcd_mount_info(task_vars) + available = mount_info["size_available"] + total = mount_info["size_total"] + used = total - available - size_limit_percent = get_var( + threshold = get_var( task_vars, - "etcd_disk_size_limit_percent", - default=self.etcd_default_size_limit_percent + "etcd_device_usage_threshold_percent", + default=self.default_etcd_device_usage_threshold_percent ) - if etcd_disk_size_used / etcd_disk_size_total > size_limit_percent: - msg = ("Current etcd volume usage ({actual:.2f} GB) for the volume \"{volume}\" " - "is greater than the storage limit ({limit:.2f} GB).") - msg = msg.format( - actual=self._to_gigabytes(etcd_disk_size_used), - volume=etcd_mount_path["mount"], - limit=self._to_gigabytes(size_limit_percent * etcd_disk_size_total), + used_percent = 100.0 * used / total + + if used_percent > threshold: + device = mount_info.get("device", "unknown") + mount = mount_info.get("mount", "unknown") + msg = "etcd storage usage ({:.1f}%) is above threshold ({:.1f}%). Device: {}, mount: {}.".format( + used_percent, threshold, device, mount ) return {"failed": True, "msg": msg} return {"changed": False} - @staticmethod - def _get_etcd_mount_path(ansible_mounts): - supported_mnt_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] - available_mnts = {mnt.get("mount"): mnt for mnt in ansible_mounts} + def _etcd_mount_info(self, task_vars): + ansible_mounts = get_var(task_vars, "ansible_mounts") + mounts = {mnt.get("mount"): mnt for mnt in ansible_mounts} - for path in supported_mnt_paths: - if path in available_mnts: - return available_mnts[path] + for path in self.supported_mount_paths: + if path in mounts: + return mounts[path] - paths = ', '.join(sorted(available_mnts)) or 'none' - msg = "Unable to determine available disk space. Paths mounted: {}.".format(paths) + paths = ', '.join(sorted(mounts)) or 'none' + msg = "Unable to find etcd storage mount point. Paths mounted: {}.".format(paths) raise OpenShiftCheckException(msg) - - @staticmethod - def _to_gigabytes(byte_size): - return float(byte_size) / 10.0**9 diff --git a/roles/openshift_health_checker/test/etcd_volume_test.py b/roles/openshift_health_checker/test/etcd_volume_test.py index ff8d0d8d7..917045526 100644 --- a/roles/openshift_health_checker/test/etcd_volume_test.py +++ b/roles/openshift_health_checker/test/etcd_volume_test.py @@ -16,7 +16,7 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words): with pytest.raises(OpenShiftCheckException) as excinfo: check.run(tmp=None, task_vars=task_vars) - for word in 'determine available disk'.split() + extra_words: + for word in 'Unable to find etcd storage mount point'.split() + extra_words: assert word in str(excinfo.value) @@ -69,12 +69,12 @@ def test_cannot_determine_available_disk(ansible_mounts, extra_words): ]) def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): task_vars = dict( - etcd_disk_size_limit_percent=size_limit, + etcd_device_usage_threshold_percent=size_limit, ansible_mounts=ansible_mounts, ) - if task_vars["etcd_disk_size_limit_percent"] is None: - task_vars.pop("etcd_disk_size_limit_percent") + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") check = EtcdVolume(execute_module=fake_execute_module) result = check.run(tmp=None, task_vars=task_vars) @@ -92,25 +92,25 @@ def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): 'size_available': 1 * 10**9, 'size_total': 100 * 10**9, }], - ['90.00 GB'], + ['99.0%'], ), ( - 0.7, + 70.0, [{ 'mount': '/', 'size_available': 1 * 10**6, 'size_total': 5 * 10**9, }], - ['3.50 GB'], + ['100.0%'], ), ( - 0.4, + 40.0, [{ 'mount': '/', 'size_available': 2 * 10**9, 'size_total': 6 * 10**9, }], - ['2.40 GB'], + ['66.7%'], ), ( None, @@ -125,17 +125,17 @@ def test_succeeds_with_recommended_disk_space(size_limit, ansible_mounts): 'size_available': 1 * 10**9, 'size_total': 20 * 10**9, }], - ['18.00 GB'], + ['95.0%'], ), ]) def test_fails_with_insufficient_disk_space(size_limit_percent, ansible_mounts, extra_words): task_vars = dict( - etcd_disk_size_limit_percent=size_limit_percent, + etcd_device_usage_threshold_percent=size_limit_percent, ansible_mounts=ansible_mounts, ) - if task_vars["etcd_disk_size_limit_percent"] is None: - task_vars.pop("etcd_disk_size_limit_percent") + if task_vars["etcd_device_usage_threshold_percent"] is None: + task_vars.pop("etcd_device_usage_threshold_percent") check = EtcdVolume(execute_module=fake_execute_module) result = check.run(tmp=None, task_vars=task_vars) -- cgit v1.2.3 From e149ad8185fdba2eb87282d05f4a24f3e4dd4f26 Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 11 May 2017 12:19:49 +0200 Subject: Add module docstring --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index ad88ae44d..b52a9259f 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -1,3 +1,5 @@ +"""A health check for OpenShift clusters.""" + from openshift_checks import OpenShiftCheck, OpenShiftCheckException, get_var -- cgit v1.2.3 From f4e1e89f88c6885a6f0be0e7b3eca296d646502b Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 11 May 2017 12:23:40 +0200 Subject: Make class attribute name shorter --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index b52a9259f..df35ecb33 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -9,9 +9,9 @@ class EtcdVolume(OpenShiftCheck): name = "etcd_volume" tags = ["etcd", "health"] - # pylint: disable=invalid-name - default_etcd_device_usage_threshold_percent = 90 - # where to find ectd data, higher priority first. + # Default device usage threshold. Value should be in the range [0, 100]. + default_threshold_percent = 90 + # Where to find ectd data, higher priority first. supported_mount_paths = ["/var/lib/etcd", "/var/lib", "/var", "/"] @classmethod @@ -29,7 +29,7 @@ class EtcdVolume(OpenShiftCheck): threshold = get_var( task_vars, "etcd_device_usage_threshold_percent", - default=self.default_etcd_device_usage_threshold_percent + default=self.default_threshold_percent ) used_percent = 100.0 * used / total -- cgit v1.2.3 From 92c1ef8d62fe4daf96319408f763f1bf33c7916a Mon Sep 17 00:00:00 2001 From: Rodolfo Carvalho Date: Thu, 11 May 2017 12:26:57 +0200 Subject: Update variable name to standard It was agreed to name role variables as `r_ROLE_NAME_VARIABLE_NAME`. Giving it a try. --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index df35ecb33..9e58177a5 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -28,7 +28,7 @@ class EtcdVolume(OpenShiftCheck): threshold = get_var( task_vars, - "etcd_device_usage_threshold_percent", + "r_openshift_health_checker_etcd_device_usage_threshold_percent", default=self.default_threshold_percent ) -- cgit v1.2.3 From c46a19ad719efae03362b75865cdf8dbd747b4cb Mon Sep 17 00:00:00 2001 From: juanvallejo Date: Fri, 12 May 2017 17:30:11 -0400 Subject: check if hostname is in list of etcd hosts --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index ad88ae44d..b742f7603 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -14,9 +14,10 @@ class EtcdVolume(OpenShiftCheck): @classmethod def is_active(cls, task_vars): - # TODO: only execute this check on hosts in the 'ectd' group? - # Maybe also 'masters' if there are no standalone etcd hosts? - return super(EtcdVolume, cls).is_active(task_vars) + etcd_hosts = get_var(task_vars, "groups", "etcd", default=[]) or get_var(task_vars, "groups", "masters", + default=[]) or [] + is_etcd_host = get_var(task_vars, "ansible_ssh_host") in etcd_hosts + return super(EtcdVolume, cls).is_active(task_vars) and is_etcd_host def run(self, tmp, task_vars): mount_info = self._etcd_mount_info(task_vars) -- cgit v1.2.3 From 8c518fd58724911b90def0e052ec6c21db39a5ee Mon Sep 17 00:00:00 2001 From: juanvallejo Date: Mon, 15 May 2017 16:48:21 -0400 Subject: revert role-specific var name --- roles/openshift_health_checker/openshift_checks/etcd_volume.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/openshift_health_checker/openshift_checks/etcd_volume.py b/roles/openshift_health_checker/openshift_checks/etcd_volume.py index 6db81cbef..7452c9cc1 100644 --- a/roles/openshift_health_checker/openshift_checks/etcd_volume.py +++ b/roles/openshift_health_checker/openshift_checks/etcd_volume.py @@ -29,7 +29,7 @@ class EtcdVolume(OpenShiftCheck): threshold = get_var( task_vars, - "r_openshift_health_checker_etcd_device_usage_threshold_percent", + "etcd_device_usage_threshold_percent", default=self.default_threshold_percent ) -- cgit v1.2.3