From 1e8153c2af051ce48d5aa08d3dbdc0d0970ea532 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Wed, 22 Jan 2020 03:16:06 +0100 Subject: Document another problem with lost IPs and exhausting of SDN IP range --- .../templates/scripts/check_server_status.sh.j2 | 6 ++++-- .../ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) create mode 100755 roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 (limited to 'roles/ands_monitor/templates/scripts') diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 index c2849f4..e49ec97 100755 --- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 +++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 @@ -4,6 +4,8 @@ fs=`df -lm / | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' - datafs=`df -lm /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4` mem=`free -g | grep "Mem" | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 7` cpu=`uptime | sed -e "s/[[:space:]]/\n/g" -e s/,/./g | tail -n 1` +max_cpu=$(cat /proc/cpuinfo | grep processor | tail -n 1 | cut -d ':' -f 2) +cpu_usage=$(echo "100 * $cpu / ( $max_cpu + 1)" | bc) #" if [ $fs -le 8192 ]; then echo "Only $(($fs / 1024)) GB left in the root file system" @@ -17,8 +19,8 @@ if [ $mem -le 16 ]; then echo "The system is starving on memory, $mem GB left free" fi -if [ `echo "$cpu < 20" | bc` -eq 0 ]; then - echo "The system is starving on cpu, $cpu is load average for the last 15 min" +if [ `echo "$cpu_usage < 80" | bc` -eq 0 ]; then + echo "The system is starving on cpu, $cpu ($cpu_usage%) is load average for the last 15 min" fi vol=$(/opt/MegaRAID/storcli/storcli64 /c0/v0 show | grep -P "^0/0" | grep "Optl" | wc -l) diff --git a/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2 
new file mode 100755
index 0000000..c938121
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/clean_sdn_ips.sh.j2
@@ -0,0 +1,56 @@
+#! /bin/bash
+#
+# Release SDN IP addresses leaked on this node.
+#
+# Scans the lease files under /var/lib/cni/networks/openshift-sdn that were
+# not modified for two hours, removes those whose container ID is unknown to
+# docker, and restarts origin-node. Scheduling on the node is disabled while
+# the clean-up runs. Problems are reported on stdout; the exit status is
+# non-zero if any step failed.
+
+sdn_dir="/var/lib/cni/networks/openshift-sdn"
+host=$(uname -n)
+
+# Put the node back into the scheduling pool, reporting a failure on stdout.
+enable_scheduling() {
+    if ! oc adm manage-node "$host" --schedulable=true &> /dev/null; then
+        echo "Failed to re-enable scheduling on the node $host"
+        return 1
+    fi
+}
+
+# Check node is in the cluster and we have permissions to access OpenShift
+if ! oc get node "$host" &> /dev/null; then
+    echo "Can't query node $host, check cluster configuration and permissions"
+    exit 1
+fi
+
+if ! oc adm manage-node "$host" --schedulable=false &> /dev/null; then
+    echo "Failed to disable scheduling on the node $host"
+    exit 1
+fi
+
+# Snapshot the containers known to docker once. Give up if docker can't be
+# queried: with an empty list every lease would look stale and be removed.
+if ! containers=$(docker ps -a -q); then
+    echo "Failed to list docker containers on the node $host"
+    enable_scheduling
+    exit 1
+fi
+
+while IFS= read -r -d '' lease; do
+    # The first purely alphanumeric line is taken as the owning container ID
+    # (assumed from the lease file format - verify on the target CNI version);
+    # files without such a line are left alone.
+    id=$(grep -m 1 -x -E '[A-Za-z0-9]{8,}' "$lease") || continue
+    # docker lists abbreviated IDs, so compare on the first 8 characters only
+    grep -q "^${id:0:8}" <<< "$containers" || rm -f -- "$lease"
+done < <(find "$sdn_dir" -type f -mmin +120 -print0)
+
+status=0
+if ! systemctl restart origin-node; then
+    echo "Failed to restart origin-node on the node $host"
+    status=1
+fi
+enable_scheduling || status=1
+exit $status
-- 
cgit v1.2.3