From 83378c05f3aaa67e36a3e7577784351fdb711c57 Mon Sep 17 00:00:00 2001 From: Eric Wolinetz Date: Mon, 15 Jan 2018 12:18:08 -0600 Subject: Only automatically restart if cluster is in yellow or green state --- .../callback_plugins/installer_checkpoint.py | 4 + .../tasks/restart_cluster.yml | 160 ++++++++++++--------- .../tasks/restart_es_node.yml | 4 +- 3 files changed, 97 insertions(+), 71 deletions(-) (limited to 'roles') diff --git a/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py b/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py index da7e7b1da..a38b95c1d 100644 --- a/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py +++ b/roles/installer_checkpoint/callback_plugins/installer_checkpoint.py @@ -127,6 +127,10 @@ class CallbackModule(CallbackBase): self._display.display( '\tThis phase can be restarted by running: {}'.format( phase_attributes[phase]['playbook'])) + if 'message' in stats.custom['_run'][phase]: + self._display.display( + '\t{}'.format( + stats.custom['_run'][phase]['message'])) self._display.display("", screen_only=True) diff --git a/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml b/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml index 6bce13d1d..879459cf6 100644 --- a/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml +++ b/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml @@ -1,91 +1,113 @@ --- -# Disable external communication for {{ _cluster_component }} -- name: Disable external communication for logging-{{ _cluster_component }} - oc_service: - state: present - name: "logging-{{ _cluster_component }}" - namespace: "{{ openshift_logging_elasticsearch_namespace }}" - selector: - component: "{{ _cluster_component }}" - provider: openshift - connection: blocked - labels: - logging-infra: 'support' - ports: - - port: 9200 - targetPort: "restapi" - when: - - full_restart_cluster | bool - ## get all pods for the cluster - command: > oc get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name} register: _cluster_pods -- name: "Disable shard balancing for logging-{{ _cluster_component }} cluster" - command: > - oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }' - register: _disable_output - changed_when: "'\"acknowledged\":true' in _disable_output.stdout" +### Check for cluster state before making changes -- if its red then we don't want to continue +- name: "Checking current health for {{ _es_node }} cluster" + shell: > + oc exec "{{ _cluster_pods.stdout.split(' ')[0] }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health + register: _pod_status when: _cluster_pods.stdout_lines | count > 0 -# Flush ES -- name: "Flushing for logging-{{ _cluster_component }} cluster" - command: > - oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_flush/synced' - register: _flush_output - changed_when: "'\"acknowledged\":true' in _flush_output.stdout" - when: +- when: + - _pod_status.stdout is defined + - (_pod_status.stdout | from_json)['status'] in ['red'] + block: + - name: Set Logging message to manually restart + run_once: true + set_stats: + data: + installer_phase_logging: + message: "Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart." + + - debug: msg="Cluster logging-{{ _cluster_component }} was in a red state and will not be automatically restarted. Please see documentation regarding doing a {{ 'full' if full_restart_cluster | bool else 'rolling'}} cluster restart." + +- when: _pod_status.stdout is undefined or (_pod_status.stdout | from_json)['status'] in ['green', 'yellow'] + block: + # Disable external communication for {{ _cluster_component }} + - name: Disable external communication for logging-{{ _cluster_component }} + oc_service: + state: present + name: "logging-{{ _cluster_component }}" + namespace: "{{ openshift_logging_elasticsearch_namespace }}" + selector: + component: "{{ _cluster_component }}" + provider: openshift + connection: blocked + labels: + logging-infra: 'support' + ports: + - port: 9200 + targetPort: "restapi" + when: + - full_restart_cluster | bool + + - name: "Disable shard balancing for logging-{{ _cluster_component }} cluster" + command: > + oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "none" } }' + register: _disable_output + changed_when: "'\"acknowledged\":true' in _disable_output.stdout" + when: _cluster_pods.stdout_lines | count > 0 + + # Flush ES + - name: "Flushing for logging-{{ _cluster_component }} cluster" + command: > + oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_flush/synced' + register: _flush_output + changed_when: "'\"acknowledged\":true' in _flush_output.stdout" + when: - _cluster_pods.stdout_lines | count > 0 - full_restart_cluster | bool -- command: > - oc get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name} - register: _cluster_dcs + - command: > + oc get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name} + register: _cluster_dcs -## restart all dcs for full restart -- name: "Restart ES node {{ _es_node }}" - include_tasks: restart_es_node.yml - with_items: "{{ _cluster_dcs }}" - loop_control: - loop_var: _es_node - when: + ## restart all dcs for full restart + - name: "Restart ES node {{ _es_node }}" + include_tasks: restart_es_node.yml + with_items: "{{ _cluster_dcs }}" + loop_control: + loop_var: _es_node + when: - full_restart_cluster | bool -## restart the node if it's dc is in the list of nodes to restart? -- name: "Restart ES node {{ _es_node }}" - include_tasks: restart_es_node.yml - with_items: "{{ _restart_logging_nodes }}" - loop_control: - loop_var: _es_node - when: + ## restart the node if it's dc is in the list of nodes to restart? + - name: "Restart ES node {{ _es_node }}" + include_tasks: restart_es_node.yml + with_items: "{{ _restart_logging_nodes }}" + loop_control: + loop_var: _es_node + when: - not full_restart_cluster | bool - _es_node in _cluster_dcs.stdout -## we may need a new first pod to run against -- fetch them all again -- command: > - oc get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name} - register: _cluster_pods + ## we may need a new first pod to run against -- fetch them all again + - command: > + oc get pod -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[?(@.status.phase==\"Running\")].metadata.name} + register: _cluster_pods -- name: "Enable shard balancing for logging-{{ _cluster_component }} cluster" - command: > - oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }' - register: _enable_output - changed_when: "'\"acknowledged\":true' in _enable_output.stdout" + - name: "Enable shard balancing for logging-{{ _cluster_component }} cluster" + command: > + oc exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }' + register: _enable_output + changed_when: "'\"acknowledged\":true' in _enable_output.stdout" -# Reenable external communication for {{ _cluster_component }} -- name: Reenable external communication for logging-{{ _cluster_component }} - oc_service: - state: present - name: "logging-{{ _cluster_component }}" - namespace: "{{ openshift_logging_elasticsearch_namespace }}" - selector: - component: "{{ _cluster_component }}" - provider: openshift - labels: - logging-infra: 'support' - ports: + # Reenable external communication for {{ _cluster_component }} + - name: Reenable external communication for logging-{{ _cluster_component }} + oc_service: + state: present + name: "logging-{{ _cluster_component }}" + namespace: "{{ openshift_logging_elasticsearch_namespace }}" + selector: + component: "{{ _cluster_component }}" + provider: openshift + labels: + logging-infra: 'support' + ports: - port: 9200 targetPort: "restapi" - when: + when: - full_restart_cluster | bool diff --git a/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml b/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml index 6d0df40c8..fe15e40fd 100644 --- a/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml +++ b/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml @@ -26,12 +26,12 @@ - name: "Waiting for ES to be ready for {{ _es_node }}" shell: > - oc exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- {{ __es_local_curl }} https://localhost:9200/_cat/health | cut -d' ' -f4 + oc exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health with_items: "{{ _pods.stdout.split(' ') }}" loop_control: loop_var: _pod register: _pod_status - until: _pod_status.stdout in ['green', 'yellow'] + until: (_pod_status.stdout | from_json)['status'] in ['green', 'yellow'] retries: 60 delay: 5 changed_when: false -- cgit v1.2.3