Merge pull request #7097 from ewolinetz/logging_fresh_lg_cluster_fix

Automatic merge from submit-queue. Whenever we create a new ES node, ignore health checks; also change the Prometheus password generation for increased secret idempotency. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=1540099. Whenever the cluster size is > 1, the number of nodes required for recovery is also > 1, so on a fresh install the cluster never starts up because the required number of nodes is not met. Whenever we are creating a new node, we therefore do not wait for the health check, so that the logging playbook can complete and we can roll out all updated nodes. This change also addresses Prometheus password generation so that each rerun of the playbook no longer changes the secret, which would otherwise trigger a full rollout of the cluster (a changed secret is assumed to mean that keys/certs have changed).
author: OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com> 2018-02-14 14:28:33 -0800
committer: GitHub <noreply@github.com> 2018-02-14 14:28:33 -0800
commit: b62c397f0625b9ff3654347a1777ed2277942712 (patch)
tree: 950a36359a9ac5e7d4a0b692ccdaf43e6f106463
parent: deb9a793cbb169b964424720f9c3a6ce6b976b09 (diff)
parent: 61df593d2047995f25327e54b32956944f413100 (diff)
4 files changed, 29 insertions, 10 deletions
diff --git a/roles/openshift_logging/tasks/install_logging.yaml b/roles/openshift_logging/tasks/install_logging.yaml
index 9fabc5826..66dd2f5a3 100644
--- a/roles/openshift_logging/tasks/install_logging.yaml
+++ b/roles/openshift_logging/tasks/install_logging.yaml
@@ -131,6 +131,7 @@
     openshift_logging_elasticsearch_storage_type: "{{ elasticsearch_storage_type | default(default_elasticsearch_storage_type) }}"
     openshift_logging_elasticsearch_pvc_pv_selector: "{{ openshift_logging_es_pv_selector }}"
     openshift_logging_elasticsearch_pvc_storage_class_name: "{{ openshift_logging_es_pvc_storage_class_name | default() }}"
+    __logging_scale_up: True
 
   with_sequence: count={{ openshift_logging_es_cluster_size | int - openshift_logging_facts.elasticsearch.deploymentconfigs.keys() | count }}
   loop_control:
@@ -221,6 +222,7 @@
     openshift_logging_es_hostname: "{{ openshift_logging_es_ops_hostname }}"
     openshift_logging_es_edge_term_policy: "{{ openshift_logging_es_ops_edge_term_policy | default('') }}"
     openshift_logging_es_allow_external: "{{ openshift_logging_es_ops_allow_external }}"
+    __logging_ops_scale_up: True
 
   with_sequence: count={{ openshift_logging_es_ops_cluster_size | int - openshift_logging_facts.elasticsearch_ops.deploymentconfigs.keys() | count }}
   loop_control:
diff --git a/roles/openshift_logging_elasticsearch/tasks/main.yaml b/roles/openshift_logging_elasticsearch/tasks/main.yaml
index 64e5a3a1f..441460b2d 100644
--- a/roles/openshift_logging_elasticsearch/tasks/main.yaml
+++ b/roles/openshift_logging_elasticsearch/tasks/main.yaml
@@ -138,15 +138,22 @@
   - "prometheus_out.stderr | length > 0"
   - "'already exists' not in prometheus_out.stderr"
 
-- set_fact:
-    _logging_metrics_proxy_passwd: "{{ 16 | lib_utils_oo_random_word | b64encode }}"
+- name: Checking for passwd.yml
+  stat: path="{{ generated_certs_dir }}/passwd.yml"
+  register: passwd_file
+  check_mode: no
 
-- template:
+- when: not passwd_file.stat.exists
+  template:
     src: passwd.j2
-    dest: "{{mktemp.stdout}}/passwd.yml"
+    dest: "{{ generated_certs_dir }}/passwd.yml"
   vars:
     logging_user_name: "{{ openshift_logging_elasticsearch_prometheus_sa }}"
-    logging_user_passwd: "{{ _logging_metrics_proxy_passwd }}"
+    logging_user_passwd: "{{ 16 | lib_utils_oo_random_word | b64encode }}"
+
+- slurp:
+    src: "{{ generated_certs_dir }}/passwd.yml"
+  register: _logging_metrics_proxy_passwd
 
 # View role and binding
 - name: Generate logging-elasticsearch-view-role
@@ -296,7 +303,7 @@
     - name: admin.jks
       path: "{{ generated_certs_dir }}/system.admin.jks"
     - name: passwd.yml
-      path: "{{mktemp.stdout}}/passwd.yml"
+      path: "{{ generated_certs_dir }}/passwd.yml"
 
 # services
 - name: Set logging-{{ es_component }}-cluster service
@@ -433,7 +440,7 @@
     es_container_security_context: "{{ _es_containers.elasticsearch.securityContext if _es_containers is defined and 'elasticsearch' in _es_containers and 'securityContext' in _es_containers.elasticsearch else None }}"
     deploy_type: "{{ openshift_logging_elasticsearch_deployment_type }}"
     es_replicas: 1
-    basic_auth_passwd: "{{ _logging_metrics_proxy_passwd | b64decode }}"
+    basic_auth_passwd: "{{ ( _logging_metrics_proxy_passwd['content'] | b64decode | from_yaml )[openshift_logging_elasticsearch_prometheus_sa]['passwd'] }}"
     es_number_of_shards: "{{ openshift_logging_es_number_of_shards | default(1) }}"
     es_number_of_replicas: "{{ openshift_logging_es_number_of_replicas| default(0) }}"
 
diff --git a/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml b/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml
index 14f2313e1..01247dd5d 100644
--- a/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/restart_cluster.yml
@@ -65,6 +65,12 @@
       {{ openshift_client_binary }} get dc -l component={{ _cluster_component }},provider=openshift -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
     register: _cluster_dcs
 
+  # If we are currently restarting the "es" cluster we want to check if we are scaling up the number of es nodes
+  # If we are currently restarting the "es-ops" cluster we want to check if we are scaling up the number of ops nodes
+  # If we've created a new node for that cluster then the appropriate variable will be true, otherwise we default to false
+  - set_fact:
+      _skip_healthcheck: "{{ __logging_scale_up | default(false) if _cluster_component == 'es' else __logging_ops_scale_up | default(false) }}"
+
   ## restart all dcs for full restart
   - name: "Restart ES node {{ _es_node }}"
     include_tasks: restart_es_node.yml
@@ -94,6 +100,7 @@
       {{ openshift_client_binary }} exec {{ _cluster_pods.stdout.split(' ')[0] }} -c elasticsearch -n {{ openshift_logging_elasticsearch_namespace }} -- {{ __es_local_curl }} -XPUT 'https://localhost:9200/_cluster/settings' -d '{ "transient": { "cluster.routing.allocation.enable" : "all" } }'
     register: _enable_output
     changed_when: "'\"acknowledged\":true' in _enable_output.stdout"
+    when: _cluster_pods.stdout != ""
 
   # Reenable external communication for {{ _cluster_component }}
   - name: Reenable external communication for logging-{{ _cluster_component }}
diff --git a/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml b/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml
index a1e172168..934ab886b 100644
--- a/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml
+++ b/roles/openshift_logging_elasticsearch/tasks/restart_es_node.yml
@@ -3,7 +3,8 @@
   command: >
     {{ openshift_client_binary }} rollout latest {{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }}
 
-- name: "Waiting for {{ _es_node }} to finish scaling up"
+- when: not _skip_healthcheck | bool
+  name: "Waiting for {{ _es_node }} to finish scaling up"
   oc_obj:
     state: list
     name: "{{ _es_node }}"
@@ -19,12 +20,14 @@
   retries: 60
   delay: 30
 
-- name: Gettings name(s) of replica pod(s)
+- when: not _skip_healthcheck | bool
+  name: Gettings name(s) of replica pod(s)
   command: >
     {{ openshift_client_binary }} get pods -l deploymentconfig={{ _es_node }} -n {{ openshift_logging_elasticsearch_namespace }} -o jsonpath={.items[*].metadata.name}
   register: _pods
 
-- name: "Waiting for ES to be ready for {{ _es_node }}"
+- when: not _skip_healthcheck | bool
+  name: "Waiting for ES to be ready for {{ _es_node }}"
   shell: >
     {{ openshift_client_binary }} exec "{{ _pod }}" -c elasticsearch -n "{{ openshift_logging_elasticsearch_namespace }}" -- es_cluster_health
   with_items: "{{ _pods.stdout.split(' ') }}"
author	OpenShift Merge Robot <openshift-merge-robot@users.noreply.github.com>	2018-02-14 14:28:33 -0800
committer	GitHub <noreply@github.com>	2018-02-14 14:28:33 -0800
commit	b62c397f0625b9ff3654347a1777ed2277942712 (patch)
tree	950a36359a9ac5e7d4a0b692ccdaf43e6f106463
parent	deb9a793cbb169b964424720f9c3a6ce6b976b09 (diff)
parent	61df593d2047995f25327e54b32956944f413100 (diff)