From 4b5d8d2dc25dbca20be59f3d5d111d737fd865bc Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Tue, 1 Aug 2017 12:55:47 -0400 Subject: Switch to migrating one host and forming a new cluster With large datasets where there are many keys with TTLs the expiry was creating a data inconsistency problem. The hope is that by performing the migration once and then forming a new cluster this is avoided. Fixes https://bugzilla.redhat.com/show_bug.cgi?id=1475351 --- playbooks/common/openshift-etcd/migrate.yml | 65 ++++++++++++++++++++++++----- playbooks/common/openshift-etcd/scaleup.yml | 13 ++++++ 2 files changed, 68 insertions(+), 10 deletions(-) (limited to 'playbooks/common') diff --git a/playbooks/common/openshift-etcd/migrate.yml b/playbooks/common/openshift-etcd/migrate.yml index 3e7a48669..311ff84b6 100644 --- a/playbooks/common/openshift-etcd/migrate.yml +++ b/playbooks/common/openshift-etcd/migrate.yml @@ -17,18 +17,14 @@ tags: - always +# TODO: This will be different for release-3.6 branch - name: Prepare masters for etcd data migration hosts: oo_masters_to_config tasks: - - set_fact: - master_services: - - "{{ openshift.common.service_type + '-master' }}" - set_fact: master_services: - "{{ openshift.common.service_type + '-master-controllers' }}" - "{{ openshift.common.service_type + '-master-api' }}" - when: - - (openshift_master_cluster_method is defined and openshift_master_cluster_method == "native") or openshift.common.is_master_system_container | bool - debug: msg: "master service name: {{ master_services }}" - name: Stop masters @@ -67,16 +63,59 @@ when: - etcd_backup_failed | length > 0 -- name: Migrate etcd data from v2 to v3 +- name: Stop etcd hosts: oo_etcd_to_migrate gather_facts: no tags: - always + pre_tasks: + - set_fact: + l_etcd_service: "{{ 'etcd_container' if openshift.common.is_containerized else 'etcd' }}" + - name: Disable etcd members + service: + name: "{{ l_etcd_service }}" + state: stopped + +- name: Migrate data on first etcd + hosts: oo_etcd_to_migrate[0] + gather_facts: no + tags: + - always roles: - role: etcd_migrate r_etcd_migrate_action: migrate r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}" etcd_peer: "{{ ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + +- name: Clean data stores on remaining etcd hosts + hosts: oo_etcd_to_migrate[1:] + gather_facts: no + tags: + - always + roles: + - role: etcd_migrate + r_etcd_migrate_action: clean_data + r_etcd_common_embedded_etcd: "{{ groups.oo_etcd_to_config | default([]) | length == 0 }}" + etcd_peer: "{{ ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + post_tasks: + - name: Add etcd hosts + delegate_to: localhost + add_host: + name: "{{ item }}" + groups: oo_new_etcd_to_config + ansible_ssh_user: "{{ g_ssh_user | default(omit) }}" + ansible_become: "{{ g_sudo | default(omit) }}" + with_items: "{{ groups.oo_etcd_to_migrate[1:] | default([]) }}" + changed_when: no + - name: Set success + set_fact: + r_etcd_migrate_success: true + +- include: ./scaleup.yml - name: Gate on etcd migration hosts: oo_masters_to_config @@ -89,6 +128,16 @@ - set_fact: etcd_migration_failed: "{{ groups.oo_etcd_to_migrate | difference(etcd_migration_completed) }}" +- name: Add TTLs on the first master + hosts: oo_first_master[0] + roles: + - role: etcd_migrate + r_etcd_migrate_action: add_ttls + etcd_peer: "{{ hostvars[groups.oo_etcd_to_migrate.0].ansible_default_ipv4.address }}" + etcd_url_scheme: "https" + etcd_peer_url_scheme: "https" + when: etcd_migration_failed | length == 0 + - name: Configure masters if etcd data migration is succesfull hosts: oo_masters_to_config roles: @@ -100,10 +149,6 @@ msg: "Skipping master re-configuration since migration failed." when: - etcd_migration_failed | length > 0 - -- name: Start masters after etcd data migration - hosts: oo_masters_to_config - tasks: - name: Start master services service: name: "{{ item }}" diff --git a/playbooks/common/openshift-etcd/scaleup.yml b/playbooks/common/openshift-etcd/scaleup.yml index 192305bc8..52b90daca 100644 --- a/playbooks/common/openshift-etcd/scaleup.yml +++ b/playbooks/common/openshift-etcd/scaleup.yml @@ -24,6 +24,9 @@ member add {{ etcd_hostname }} {{ etcd_peer_url_scheme }}://{{ etcd_ip }}:{{ etcd_peer_port }} delegate_to: "{{ etcd_ca_host }}" register: etcd_add_check + retries: 3 + delay: 10 + until: etcd_add_check.rc == 0 roles: - role: openshift_etcd when: etcd_add_check.rc == 0 @@ -36,3 +39,13 @@ r_etcd_common_etcd_runtime: "{{ openshift.common.etcd_runtime }}" - role: nickhammond.logrotate when: etcd_add_check.rc == 0 + post_tasks: + - name: Verify cluster is stable + command: > + /usr/bin/etcdctl --cert-file {{ etcd_peer_cert_file }} + --key-file {{ etcd_peer_key_file }} + --ca-file {{ etcd_peer_ca_file }} + -C {{ etcd_peer_url_scheme }}://{{ hostvars[etcd_ca_host].etcd_hostname }}:{{ etcd_client_port }} + cluster-health + retries: 1 + delay: 30 -- cgit v1.2.3