6 files changed, 265 insertions, 0 deletions
diff --git a/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup b/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup
new file mode 100644
index 000000000..059058823
--- /dev/null
+++ b/playbooks/adhoc/docker_loopback_to_lvm/docker-storage-setup
@@ -0,0 +1,2 @@
+DEVS=/dev/xvdb
+VG=docker_vg
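For context: docker-storage-setup reads this file and carves the listed device into an LVM thin pool for docker. A minimal shell sketch for eyeballing the result on the host, assuming the DEVS/VG values above (the generated pool LV name varies by docker-storage-setup version, so treat "docker-pool" as an assumption):

    # Sketch: inspect what docker-storage-setup built from DEVS=/dev/xvdb, VG=docker_vg
    pvs /dev/xvdb                       # the EBS device should now be an LVM physical volume
    vgs docker_vg                       # ...belonging to the docker_vg volume group
    lvs docker_vg                       # expect a thin pool LV (commonly "docker-pool")
    cat /etc/sysconfig/docker-storage   # DOCKER_STORAGE_OPTIONS generated for the daemon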
diff --git a/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml
new file mode 100644
index 000000000..c9ae923bb
--- /dev/null
+++ b/playbooks/adhoc/docker_loopback_to_lvm/docker_loopback_to_direct_lvm.yml
@@ -0,0 +1,148 @@
+---
+# This playbook converts docker from loopback to direct-lvm (the Red Hat recommended way to run docker)
+# in AWS. This adds an additional EBS volume and creates the Volume Group on this EBS volume to use.
+#
+# To run:
+#  1. Source your AWS credentials (make sure it's the corresponding AWS account) into your environment
+#    export AWS_ACCESS_KEY_ID='XXXXX'
+#    export AWS_SECRET_ACCESS_KEY='XXXXXX'
+#
+#  2. run the playbook:
+#   ansible-playbook -e 'cli_tag_name=<tag-name>' -e "cli_volume_size=30" docker_loopback_to_direct_lvm.yml
+#
+#  Example:
+#   ansible-playbook -e 'cli_tag_name=ops-master-12345' -e "cli_volume_size=30" docker_loopback_to_direct_lvm.yml
+#
+# Notes:
+# * By default this will do a 30GB volume.
+# * iops are calculated by Disk Size * 30. e.g. (30GB * 30) = 900 iops
+# * This will remove /var/lib/docker!
+# * You may need to re-deploy docker images after this is run (like monitoring)
+#
+
+- name: Fix docker to have a provisioned iops drive
+  hosts: "tag_Name_{{ cli_tag_name }}"
+  user: root
+  connection: ssh
+  gather_facts: no
+
+  vars:
+    cli_volume_type: io1
+    cli_volume_size: 30
+    cli_volume_iops: "{{ 30 * cli_volume_size }}"
+
+  pre_tasks:
+  - fail:
+      msg: "This playbook requires {{ item }} to be set."
+    when: "{{ item }} is not defined or {{ item }} == ''"
+    with_items:
+    - cli_tag_name
+    - cli_volume_size
+
+  - debug:
+      var: hosts
+
+  - name: start docker
+    service:
+      name: docker
+      state: started
+
+  - name: Determine if loopback
+    shell: docker info | grep 'Data file:.*loop'
+    register: loop_device_check
+    ignore_errors: yes
+
+  - debug:
+      var: loop_device_check
+
+  - name: fail if we don't detect loopback
+    fail:
+      msg: loopback not detected! Please investigate manually.
+    when: loop_device_check.rc == 1
+
+  - name: stop zagg client monitoring container
+    service:
+      name: oso-rhel7-zagg-client
+      state: stopped
+    ignore_errors: yes
+
+  - name: stop pcp client monitoring container
+    service:
+      name: oso-f22-host-monitoring
+      state: stopped
+    ignore_errors: yes
+
+  - name: stop docker
+    service:
+      name: docker
+      state: stopped
+
+  - name: delete /var/lib/docker
+    command: rm -rf /var/lib/docker
+
+  - name: check to see if /dev/xvdb exists
+    command: test -e /dev/xvdb
+    register: xvdb_check
+    ignore_errors: yes
+
+  - debug: var=xvdb_check
+
+  - name: fail if /dev/xvdb already exists
+    fail:
+      msg: /dev/xvdb already exists. Please investigate
+    when: xvdb_check.rc == 0
+
+  - name: Create a volume and attach it
+    delegate_to: localhost
+    ec2_vol:
+      state: present
+      instance: "{{ ec2_id }}"
+      region: "{{ ec2_region }}"
+      volume_size: "{{ cli_volume_size | default(30, True) }}"
+      volume_type: "{{ cli_volume_type }}"
+      device_name: /dev/xvdb
+      iops: "{{ cli_volume_iops }}"
+    register: vol
+
+  - debug: var=vol
+
+  - name: tag the vol with a name
+    delegate_to: localhost
+    ec2_tag: region={{ ec2_region }} resource={{ vol.volume_id }}
+    args:
+      tags:
+        Name: "{{ ec2_tag_Name }}"
+        env: "{{ ec2_tag_environment }}"
+    register: voltags
+
+  - name: Wait for volume to attach
+    pause:
+      seconds: 30
+
+  - name: copy the docker-storage-setup config file
+    copy:
+      src: docker-storage-setup
+      dest: /etc/sysconfig/docker-storage-setup
+      owner: root
+      group: root
+      mode: 0664
+
+  - name: docker storage setup
+    command: docker-storage-setup
+    register: setup_output
+
+  - debug: var=setup_output
+
+  - name: start docker
+    command: systemctl start docker.service
+    register: dockerstart
+
+  - debug: var=dockerstart
+
+  - name: Wait for docker to stabilize
+    pause:
+      seconds: 30
+
+  # leaving off the '-t' for docker exec. With it, it doesn't work with ansible and tty support
+  - name: update zabbix docker items
+    command: docker exec -i oso-rhel7-zagg-client /usr/local/bin/cron-send-docker-metrics.py
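A quick hedged post-run check, sketched in shell; the grep strings come from the devicemapper section of docker info that the play itself matches, but those field names can shift between docker versions:

    # Sketch: confirm docker is off loopback after the play completes.
    docker info | grep 'Data file'   # loopback-only field; expect no output now
    docker info | grep 'Pool Name'   # direct-lvm reports the thin pool instead
    lvs docker_vg                    # pool carved from the new /dev/xvdb volume
    # iops sizing per the play's formula: 30 * cli_volume_size, e.g. 30 * 30 = 900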
diff --git a/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml b/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml
new file mode 100644
index 000000000..1946a5f4f
--- /dev/null
+++ b/playbooks/adhoc/docker_storage_cleanup/docker_storage_cleanup.yml
@@ -0,0 +1,69 @@
+---
+# This playbook attempts to clean up unwanted docker files to help alleviate docker disk space issues.
+#
+# To run:
+#
+#  1. run the playbook:
+#
+#   ansible-playbook -e 'cli_tag_name=<tag-name>' docker_storage_cleanup.yml
+#
+#  Example:
+#
+#   ansible-playbook -e 'cli_tag_name=ops-node-compute-12345' docker_storage_cleanup.yml
+#
+# Notes:
+# * This *should* not interfere with running docker images
+#
+
+- name: Clean up Docker Storage
+  gather_facts: no
+  hosts: "tag_Name_{{ cli_tag_name }}"
+  user: root
+  connection: ssh
+
+  pre_tasks:
+
+  - fail:
+      msg: "This playbook requires {{ item }} to be set."
+    when: "{{ item }} is not defined or {{ item }} == ''"
+    with_items:
+    - cli_tag_name
+
+  - name: Ensure docker is running
+    service:
+      name: docker
+      state: started
+      enabled: yes
+
+  - name: Get docker info
+    command: docker info
+    register: docker_info
+
+  - name: Show docker info
+    debug:
+      var: docker_info.stdout_lines
+
+  - name: Remove exited and dead containers
+    shell: "docker ps -a | awk '/Exited|Dead/ {print $1}' | xargs --no-run-if-empty docker rm"
+    ignore_errors: yes
+
+  - name: Remove dangling docker images
+    shell: "docker images -q -f dangling=true | xargs --no-run-if-empty docker rmi"
+    ignore_errors: yes
+
+  - name: Remove non-running docker images
+    shell: "docker images -aq | xargs --no-run-if-empty docker rmi 2>/dev/null"
+    ignore_errors: yes
+
+  # leaving off the '-t' for docker exec. With it, it doesn't work with ansible and tty support
+  - name: update zabbix docker items
+    command: docker exec -i oso-rhel7-zagg-client /usr/local/bin/cron-send-docker-metrics.py
+
+  # Get and show docker info again.
+  - name: Get docker info
+    command: docker info
+    register: docker_info
+
+  - name: Show docker info
+    debug:
+      var: docker_info.stdout_lines
diff --git a/roles/os_zabbix/tasks/main.yml b/roles/os_zabbix/tasks/main.yml
index f9672b9c4..8347e9a61 100644
--- a/roles/os_zabbix/tasks/main.yml
+++ b/roles/os_zabbix/tasks/main.yml
@@ -11,6 +11,7 @@
 - include_vars: template_os_linux.yml
 - include_vars: template_docker.yml
 - include_vars: template_openshift_master.yml
+- include_vars: template_openshift_node.yml
 
 - name: Include Template Heartbeat
   include: ../../lib_zabbix/tasks/create_template.yml
@@ -43,3 +44,11 @@
     server: "{{ ozb_server }}"
     user: "{{ ozb_user }}"
     password: "{{ ozb_password }}"
+
+- name: Include Template Openshift Node
+  include: ../../lib_zabbix/tasks/create_template.yml
+  vars:
+    template: "{{ g_template_openshift_node }}"
+    server: "{{ ozb_server }}"
+    user: "{{ ozb_user }}"
+    password: "{{ ozb_password }}"
diff --git a/roles/os_zabbix/vars/template_openshift_master.yml b/roles/os_zabbix/vars/template_openshift_master.yml
index 728423ac1..c71e07910 100644
--- a/roles/os_zabbix/vars/template_openshift_master.yml
+++ b/roles/os_zabbix/vars/template_openshift_master.yml
@@ -6,8 +6,25 @@ g_template_openshift_master:
     applications:
     - Openshift Master
     key: create_app
+
+  - key: openshift.master.process.count
+    description: Shows number of master processes running
+    type: int
+    applications:
+    - Openshift Master
+
   ztriggers:
   - name: 'Application creation has failed on {HOST.NAME}'
     expression: '{Template Openshift Master:create_app.last(#1)}=1 and {Template Openshift Master:create_app.last(#2)}=1'
     url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/check_create_app.asciidoc'
     priority: avg
+
+  - name: 'Openshift Master process not running on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.process.count.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    priority: high
+
+  - name: 'Too many Openshift Master processes running on {HOST.NAME}'
+    expression: '{Template Openshift Master:openshift.master.process.count.min(#3)}>1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_master.asciidoc'
+    priority: high
diff --git a/roles/os_zabbix/vars/template_openshift_node.yml b/roles/os_zabbix/vars/template_openshift_node.yml
new file mode 100644
index 000000000..36f9cc4a3
--- /dev/null
+++ b/roles/os_zabbix/vars/template_openshift_node.yml
@@ -0,0 +1,20 @@
+---
+g_template_openshift_node:
+  name: Template Openshift Node
+  zitems:
+  - key: openshift.node.process.count
+    description: Shows number of OpenShift Node processes running
+    type: int
+    applications:
+    - Openshift Node
+
+  ztriggers:
+  - name: 'Openshift Node process not running on {HOST.NAME}'
+    expression: '{Template Openshift Node:openshift.node.process.count.max(#3)}<1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+    priority: high
+
+  - name: 'Too many Openshift Node processes running on {HOST.NAME}'
+    expression: '{Template Openshift Node:openshift.node.process.count.min(#3)}>1'
+    url: 'https://github.com/openshift/ops-sop/blob/master/V3/Alerts/openshift_node.asciidoc'
+    priority: high
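The process.count items above imply something on each host pushing values; the triggers simply alert on what arrives (max(#3)<1: all of the last three values were zero; min(#3)>1: more than one process for three polls in a row). A minimal sender sketch, assuming zabbix_sender delivery and an illustrative pgrep pattern, not the actual ops tooling behind these keys:

    #!/bin/bash
    # Sketch: report how many node processes run, for openshift.node.process.count.
    # ZABBIX_SERVER and the 'openshift start node' pattern are assumptions.
    count=$(pgrep -f -c 'openshift start node')
    zabbix_sender -z "$ZABBIX_SERVER" -s "$(hostname)" \
      -k openshift.node.process.count -o "${count:-0}"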