-rw-r--r--  ansible/Dockerfile | 14
-rw-r--r--  ansible/ansible-build.sh | 1
-rw-r--r--  ansible/ansible-config.sh | 1
-rwxr-xr-x  ansible/ansible.sh | 19
-rw-r--r--  docs/maintenance.txt | 4
-rw-r--r--  docs/problems.txt | 31
-rw-r--r--  docs/troubleshooting.txt | 14
-rw-r--r--  docs/webservices.txt | 2
-rw-r--r--  group_vars/all.yml | 2
-rw-r--r--  inventories/production.erb | 2
-rw-r--r--  log.txt | 48
-rwxr-xr-x  logs/2019.09.26/filter.sh (renamed from logs/filter.sh) | 0
-rw-r--r--  logs/2019.09.26/filters.txt (renamed from logs/filters.txt) | 0
-rw-r--r--  logs/2019.09.26/logs/messages.ipekatrin2 (renamed from logs/2019.09.26/messages.ipekatrin2) | 0
-rw-r--r--  logs/2019.09.26/logs/messages.ipekatrin3 (renamed from logs/2019.09.26/messages.ipekatrin3) | 0
-rw-r--r--  logs/2025.11.03.storage-log.txt | 140
-rw-r--r--  scripts/disaster/gluster_endpoints/add_endpoints.sh | 17
-rw-r--r--  scripts/disaster/gluster_endpoints/backups/ipekatrin1-edited.yaml | 85
-rw-r--r--  scripts/disaster/gluster_endpoints/backups/ipekatrin1.yaml | 87
-rw-r--r--  scripts/disaster/gluster_endpoints/backups/storageclasses_backup_2025-10-29.yaml | 38
-rw-r--r--  scripts/disaster/gluster_endpoints/check_pv.sh | 50
-rw-r--r--  scripts/disaster/gluster_endpoints/find_inline_gluster_in_pods.sh | 7
-rw-r--r--  scripts/disaster/gluster_endpoints/remove_endpoints.sh | 27
-rw-r--r--  scripts/disaster/gluster_endpoints/remove_storageclasses.sh | 7
-rw-r--r--  scripts/disaster/walker.sh | 73
-rw-r--r--  scripts/maintain/gluster/bricks_move_heketi.sh | 39
-rw-r--r--  scripts/maintain/gluster/bricks_populate.sh | 11
-rw-r--r--  scripts/maintain/gluster/heal-walk.sh | 35
-rw-r--r--  setup/configs/openshift.yml | 27
-rw-r--r--  setup/users/htpasswd | 4
30 files changed, 764 insertions, 21 deletions
diff --git a/ansible/Dockerfile b/ansible/Dockerfile
new file mode 100644
index 0000000..c654f6a
--- /dev/null
+++ b/ansible/Dockerfile
@@ -0,0 +1,14 @@
+FROM cytopia/ansible:2.10-tools
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1
+
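+# Extra system and Python tooling on top of the base cytopia/ansible image, so that playbooks
+# using netaddr/jmespath/xml filters and ssh/rsync-based tasks work out of the box.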
+RUN apk add --no-cache \
+ bash openssh sshpass rsync curl jq git vim less \
+ py3-netaddr \
+ py3-paramiko py3-cryptography \
+ py3-lxml py3-requests py3-yaml \
+ py3-jmespath py3-xmltodict py3-jsonschema py3-psutil
+
+WORKDIR /work
+CMD ["ansible", "--version"]
diff --git a/ansible/ansible-build.sh b/ansible/ansible-build.sh
new file mode 100644
index 0000000..74576da
--- /dev/null
+++ b/ansible/ansible-build.sh
@@ -0,0 +1 @@
+docker build -t ansible:2.10-cytopia-plus .
diff --git a/ansible/ansible-config.sh b/ansible/ansible-config.sh
new file mode 100644
index 0000000..f1b2367
--- /dev/null
+++ b/ansible/ansible-config.sh
@@ -0,0 +1 @@
+for c in ansible ansible-playbook ansible-vault ansible-config ansible-console ansible-doc ansible-galaxy ansible-inventory ansible-pull ansible-test ansible-connection; do ln -sf /root/ansible.sh /usr/bin/$c; done
diff --git a/ansible/ansible.sh b/ansible/ansible.sh
new file mode 100755
index 0000000..7d50833
--- /dev/null
+++ b/ansible/ansible.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+IMG=ansible:2.10-cytopia-plus
+SSH_ARGS=()
+
+#[[ -n "${SSH_AUTH_SOCK:-}" && -S "$SSH_AUTH_SOCK" ]] && SSH_ARGS=(-v "$SSH_AUTH_SOCK":/ssh-agent -e SSH_AUTH_SOCK=/ssh-agent)
+
+
+
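+# The script is symlinked under all ansible-* command names (see ansible-config.sh),
+# so the name it was invoked as selects which command runs inside the container.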
+cmd="$(basename "$0")"
+
+exec docker run --rm -it \
+ -u "$(id -u):$(id -g)" \
+ -v "/root/ands":/root/ands -w /root/ands \
+ -v "$HOME/.ssh":/root/.ssh:ro \
+ -v "$HOME/.ansible":/root/.ansible \
+ "${SSH_ARGS[@]}" \
+    "$IMG" "$cmd" "$@"
\ No newline at end of file
diff --git a/docs/maintenance.txt b/docs/maintenance.txt
index 9f52e18..c05b10f 100644
--- a/docs/maintenance.txt
+++ b/docs/maintenance.txt
@@ -53,3 +53,7 @@ Unused resources
oc delete image sha256:04afd4d4a0481e1510f12d6d071f1dceddef27416eb922cf524a61281257c66e
* Cleaning old dangling images using docker (on all nodes). Tried and as it seems caused no issues to the operation of the cluster.
docker rmi $(docker images --filter "dangling=true" -q --no-trunc)
+
+ - Cleaning up log files that exhaust inodes, etc.
+ * Volume log files
+ find /var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/ -name '*.log' -delete
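+   * Inode usage can be checked before and after the cleanup (a quick sketch; '/var' is assumed to be the filesystem holding these logs):
+       df -i /var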
diff --git a/docs/problems.txt b/docs/problems.txt
index 3b652ec..49137aa 100644
--- a/docs/problems.txt
+++ b/docs/problems.txt
@@ -170,4 +170,33 @@ Orphaning / pod termination problems in the logs
Scenario:
* Reported on long running pods with persistent volumes (katrin, adai-db)
* Also seems an unrelated set of the problems.
-
\ No newline at end of file
+
+
+Evicted Pods
+============
+    Pods are evicted if the node running the pod becomes unavailable or does not have enough resources to run the pod.
+    - It is possible to look up which resource is likely triggering the eviction with
+ > oc describe node ipekatrin2.ipe.kit.edu
+ Type Status LastHeartbeatTime LastTransitionTime Reason Message
+ ---- ------ ----------------- ------------------ ------ -------
+ OutOfDisk False Tue, 05 Apr 2022 03:24:54 +0200 Tue, 21 Dec 2021 19:09:33 +0100 KubeletHasSufficientDisk kubelet has sufficient disk space available
+ MemoryPressure False Tue, 05 Apr 2022 03:24:54 +0200 Tue, 21 Dec 2021 19:09:33 +0100 KubeletHasSufficientMemory kubelet has sufficient memory available
+ DiskPressure False Tue, 05 Apr 2022 03:24:54 +0200 Mon, 04 Apr 2022 10:00:23 +0200 KubeletHasNoDiskPressure kubelet has no disk pressure
+ Ready True Tue, 05 Apr 2022 03:24:54 +0200 Tue, 21 Dec 2021 19:09:43 +0100 KubeletReady kubelet is posting ready status
+      The latest transition, 'DiskPressure', happened on Apr 04. So the disk is likely the issue.
+
+ - DiskPressure eviction
+      * This might happen because the pod writes too much output to the logs (standard output). These logs are stored under '/var/lib/origin/openshift.local.volumes/pods/...'
+      and, if they grow large, might use all the space in the '/var' file system. OpenShift does not rotate these logs and has no other mechanism to prevent large output from eventually
+      causing space issues. So, pods have to rate-limit output to stdout. Otherwise, we need to find the misbehaving pods which write too much...
+      * Another problem is 'inode' pressure. This can be checked with 'df' and anything above 80% is definitely a sign of a problem
+ df -i
+ The particular folder with lots of inodes can be found with the following command:
+ { find / -xdev -printf '%h\n' | sort | uniq -c | sort -k 1 -n; } 2>/dev/null
+ Likely there would be some openshift-related volume logs in '/var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/'
+ Check particularly cronJob logs mounting volumes, e.g. various 'adei' stuff. Can be cleaned with
+ find /var/lib/origin/openshift.local.volumes/plugins/kubernetes.io/ -name '*.log' -delete
+
+    - If a resource is not available for a long time, the node will become NotReady and all pods will be evicted. However, short-term problems caused by the pod itself likely cause only the eviction of this
+      particular pod (once the pod is evicted, disk/memory space is reclaimed and logs are deleted). So, it is possible to find the problematic pod by looking at which pod was evicted most frequently.
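+      A rough way to count evictions per pod is sketched below (assuming the STATUS column is the 4th field of 'oc get pods --all-namespaces'):
+        oc get pods --all-namespaces --no-headers | awk '$4 == "Evicted" {print $1"/"$2}' | sort | uniq -c | sort -n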
+
diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt
index 315f9f4..0621b25 100644
--- a/docs/troubleshooting.txt
+++ b/docs/troubleshooting.txt
@@ -151,8 +151,17 @@ nodes: domino failures
   * This might continue infinitely as one node gets disconnected after another, pods get rescheduled, and the process never stops
   * The only solution is to temporarily remove some pods, e.g. ADEI pods could be easily removed and then provisioned back
-pods: very slow scheduling (normal start time in seconds range), failed pods, rogue namespaces, etc...
+pods: failed or very slow scheduling (normal start time in seconds range), failed pods, rogue namespaces, etc...
====
+ - LSDF mounts might cause pod-scheduling to fail
+   * It seems OpenShift tries to index (chroot+chmod) files on mount and times out if the LSDF volume has too many small files...
+   * Reducing the number of files with 'subPath' doesn't help here, but setting a more specific 'networkPath' in the pv helps
+   * One suggestion is to remove fsGroup from the 'dc' definition, but it is added automatically if pods use network volumes;
+   setting the volume 'gid' (cifs mount parameters specified in 'mountOptions' in the pv definition) to match fsGroup doesn't help either
+   * The timeout seems to be fixed to 2m and is not configurable...
+   * Later versions of OpenShift have the 'fsGroupChangePolicy=OnRootMismatch' parameter, but it is not present in 3.9
+   => Honestly, the solution is unclear besides reducing the number of files or mounting a small share subset with few files
+
 - OpenShift has numerous problems with cleaning up resources after the pods. The problems are more likely to happen on the
   heavily loaded systems: cpu, io, interrupts, etc.
* This may be indicated in the logs with various errors reporting inability to stop containers/processes, free network
@@ -450,3 +459,6 @@ Various
- IPMI may cause problems as well. Particularly, the mounted CDrom may start complaining. Easiest is
just to remove it from the running system with
echo 1 > /sys/block/sdd/device/delete
+
+ - 'oc get scc' reports the server doesn't have a resource type "scc"
+   Delete the 'apiserver-*' pod in the 'kube-service-catalog' namespace (it will be restarted automatically)
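+   A minimal sketch of that procedure (the pod name is looked up rather than assumed):
+     oc -n kube-service-catalog get pods
+     oc -n kube-service-catalog delete pod <apiserver-pod-name>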
diff --git a/docs/webservices.txt b/docs/webservices.txt
index 0edfdeb..f8952ca 100644
--- a/docs/webservices.txt
+++ b/docs/webservices.txt
@@ -55,6 +55,6 @@ Updating/Generating certificates for the router
* New 'router-certs' secret should be created in 'default' namespace. Probably it is better to
modify existing secret than delete/create. However, the strings can't just be copied. Easiest way
is to create a new secret in temporary namespace:
- oc -n test secrets new router-certs tls.crt=kaas.pem tls.key=kaas.key
+ oc -n test secrets new router-certs-2022 tls.crt=kaas-chain-20220121.pem tls.key=kaas.key
and then copy 'tls.crt' and 'tls.key' values over.
* To reload secret, the 'router' pods should be deleted (and automatically re-created by rc).
diff --git a/group_vars/all.yml b/group_vars/all.yml
index aef2251..b3a805d 100644
--- a/group_vars/all.yml
+++ b/group_vars/all.yml
@@ -1,4 +1,4 @@
ansible_ssh_user: root
-ansible_ssh_private_key_file: /home/csa/.ssh/id_dsa
+ansible_ssh_private_key_file: /home/csa/.ssh/id_rsa
glusterfs_version: 312
diff --git a/inventories/production.erb b/inventories/production.erb
index 575a86f..edd92c3 100644
--- a/inventories/production.erb
+++ b/inventories/production.erb
@@ -1,7 +1,9 @@
[masters]
+#ipekatrin2.ipe.kit.edu
ipekatrin[1:2].ipe.kit.edu
[etcd]
+#ipekatrin2.ipe.kit.edu
ipekatrin[1:3].ipe.kit.edu
[simple_storage_nodes]
diff --git a/log.txt b/log.txt
index 6b5de65..8ee02bb 100644
--- a/log.txt
+++ b/log.txt
@@ -1,11 +1,49 @@
-Hardware
---------
- - ipekatrin1: Replaced disk in section 9. LSI software reports all is OK, but hardware led indicates a error (red). Probably indicator is broken.
+ Hardware
+ --------
+ 2024
+ - ipekatrin1: Replaced disk in section 9. LSI software reports all is OK, but the hardware led indicates an error (red). Probably the indicator is broken.
+
+ 2025.09 (early month)
+ - ipekatrin2: Replaced 3 disks (don't remember the slots). Two of them had already been replaced once.
+ - Ordered spare disks
+
+ 2025.10.23
+ - ipekatrin2: Noticed and cleared RAID alarm attributed to the battery subsystem.
+   * No apparent problems at the moment. Temperatures are all in order. Battery reports healthy. The system works as usual.
+
+ 2025.09.28 - 2025.11.03
+ - ipekatrin1: RAID controller failed. The system was not running stably after the replacement (disks disconnect after 20-30 min of operation)
+ - ipekatrin1: Temporarily converted into a master-only node (apps scheduling disabled, glusterfs stopped)
+ - ipekatrin1: New disks (from ipekatrinbackupserv1) were assembled in the RAID, added to gluster, and manual (file walk-through) healing
+   is executed. Expected to take about 2-3 weeks (at a rate of about 2TB per day). No LVM configured, direct mount.
+ - Application node will be recovered once we replace the system SSDs with larger ones (as there is currently no space for images/containers)
+   and I don't want to put it on the new RAID.
+ - Original disks from ipekatrin1 are assembled in ipekatrinbackupserv1. The disconnect problem persists as some disks stop answering
+   SENSE queries and the backplane restarts a whole bunch of 10 disks. Anyway, all disks are accessible in JBOD mode and can be copied.
+   * The XFS fs is severely damaged and needs repairs. I tried accessing some files via the xfs debugger and it worked. So, directory structure
+   and file contents are, at least partially, intact and repair should be possible.
+   * If recovery becomes necessary: buy 24 new disks, copy one-by-one, assemble in RAID, recover the FS.
+
+ 2025.12.08
+ - Copied ipekatrin1 system SSDs to new 4TB drives and reinstalled in the server (only 2TB is used due to MBR limitations)
Software
--------
2023.06.13
- Instructed MySQL slave to ignore 1062 errors as well (I have skipped a few manually, but errors appeared non-stop)
- Also ADEI-KATRIN pod got stuck. Pod was running, but apache was stuck and not replying. This caused POD state to report 'not-ready' but for some reason it was still 'live' and pod was not restarted.
-
-
\ No newline at end of file
+
+ 2025.09.28
+ - Restarted degraded GlusterFS nodes and made them work on the remaining 2 nodes (1 replica + metadata for most of our storage needs).
+ - Turned out the 'database' volume was created in RAID-0 mode and was used as the backend for the KDB database. So, the data is gone.
+ - Recovered the KDB database from backups and moved it to a glusterfs/openshift volume. Nothing is left on the 'database' volume. It can be turned off.
+
+ 2025.09.28 - 2025.11.03
+ - GlusterFS endpoints temporarily changed to use only ipekatrin2 (see details in dedicated logs)
+ - Heketi and gluster-blockd were disabled and will not be available any further. Existing heketi volumes preserved.
+
+ 2025.12.09
+ - Re-enabled scheduling on ipekatrin1.
+ - Manually ran 'adei-clean' on katrin & darwin, but keep the 'cron' scripts stopped for now.
+ - Restored configs: fstab, */gfs endpoints. Heketi/gluster-block stays disabled. No other system changes.
+ - ToDo: Re-enable the 'cron' scripts if we decide to keep the system running in parallel with KaaS2.
diff --git a/logs/filter.sh b/logs/2019.09.26/filter.sh
index 675fb90..675fb90 100755
--- a/logs/filter.sh
+++ b/logs/2019.09.26/filter.sh
diff --git a/logs/filters.txt b/logs/2019.09.26/filters.txt
index daf2bab..daf2bab 100644
--- a/logs/filters.txt
+++ b/logs/2019.09.26/filters.txt
diff --git a/logs/2019.09.26/messages.ipekatrin2 b/logs/2019.09.26/logs/messages.ipekatrin2
index 6374da7..6374da7 100644
--- a/logs/2019.09.26/messages.ipekatrin2
+++ b/logs/2019.09.26/logs/messages.ipekatrin2
diff --git a/logs/2019.09.26/messages.ipekatrin3 b/logs/2019.09.26/logs/messages.ipekatrin3
index d497fc6..d497fc6 100644
--- a/logs/2019.09.26/messages.ipekatrin3
+++ b/logs/2019.09.26/logs/messages.ipekatrin3
diff --git a/logs/2025.11.03.storage-log.txt b/logs/2025.11.03.storage-log.txt
new file mode 100644
index 0000000..a95dc57
--- /dev/null
+++ b/logs/2025.11.03.storage-log.txt
@@ -0,0 +1,140 @@
+Status
+======
+ - RAID controller failed on ipekatrin1
+ - The system was not running stably after the replacement (disks disconnect after 20-30 min of operation)
+ - ipekatrin1 was temporarily converted into a master-only node (apps scheduling disabled, glusterfs stopped)
+ - Heketi and gluster-blockd were disabled and will not be available any further. Existing heketi volumes preserved.
+ - New disks (from ipekatrinbackupserv1) were assembled in the RAID, added to gluster, and manual (file walk-through) healing
+   is executed. Expected to take about 2-3 weeks (at a rate of about 2TB per day). No LVM configured, direct mount.
+ - Application node will be recovered once we replace the system SSDs with larger ones (as there is currently no space for images/containers)
+   and I don't want to put it on the new RAID.
+
+Recovery Logs
+====
+ 2025.09.28
+ - ipekatrin1:
+    * The RAID controller doesn't see 10 disks and behaves erratically.
+    * Turned off the server and ordered a replacement.
+ - Storage:
+    * Restarted degraded GlusterFS nodes and made them work on the remaining 2 nodes (1 replica + metadata for most of our storage needs).
+    * Turned out the 'database' volume was created in RAID-0 mode and was used as the backend for the KDB database. So, the data is gone.
+    * Recovered the KDB database from backups and moved it to a glusterfs/openshift volume. Nothing is left on the 'database' volume. It can be turned off.
+
+ 2025.10.23
+ - ipekatrin1:
+    * Replaced the RAID controller. Made an attempt to rebuild, but disks get disconnected after about 30-40 minutes (recovered after shutoff, not reboot)
+    * Checked power issues: cabling bypassing the PSU and monitoring voltages (the 12V rail should not go below 11.9V). No change, voltages seemed fine.
+    * Checked cabling issues by disconnecting first one cable and then the other (supported mode, a single cable connects all disks). No change
+    * Tried to improve cooling, setting fan speeds to maximum (kept) and even temporarily installing an external cooler. Radiators were cool, also checked reported temperatures. No change, still goes down in 30-40 minutes.
+    * Suspect backplane problems. The radiators were quite hot before adjusting cooling. There seem to be known stability problems due to bad signal management in the firmware if overheated. Firmware updates are suggested to stabilize it.
+    * No support by SuperMicro. Queried Tootlec about the possibility of getting a firmware update and/or ordering a backplane [Order RG_014523_001_Chilingaryan from 16.12.2016, Angebot 14.10, Contract: 28.11]
+      Hardware: Chassis CSE-846BE2C-R1K28B, Backplane BPN-SAS3-846EL2, 2x MCX353A-FCB ConnectX-3 VPI
+    * KATRINBackupServ1 (3 years older) has a backplane with enough bays to mount the disks. We still need to be able to put in the RAID card and the Mellanox ConnectX-3 board/boards with 2 ports (can live with 1).
+ - ipekatrin2: Noticed and cleared RAID alarm attributed to the battery subsystem.
+    * No apparent problems at the moment. Temperatures are all in order. Battery reports healthy. The system works as usual.
+    * Set up temperature monitoring of the RAID card, currently 76-77C
+
+ 2025.10.27
+ - ipekatrin1:
+    * Disconnected all disks from the server and started preparing it as an application node
+ - Software:
+ * I have temporarily suspended all ADEI cronJobs to avoid resource contention on ipekatrin2 (as restart would be dangerous now) [clean (logs,etc.)/maintain (re-caching,etc.)/update(detecting new databases)]
+ - Research:
+ * DaemonSet/GlusterFS selects nodes based on the following nodeSelector
+ $ oc -n glusterfs get ds glusterfs-storage -o yaml | grep -B 5 -A 5 nodeSelector
+ nodeSelector:
+ glusterfs: storage-host
+      All nodes have corresponding labels in their metadata:
+ $ oc get node/ipekatrin1.ipe.kit.edu --show-labels -o yaml | grep -A 20 labels:
+ labels:
+ ...
+ glusterfs: storage-host
+ ...
+    * That is now removed from ipekatrin1 and should be restored if we bring the storage back
+ oc label --dry-run node/ipekatrin1.ipe.kit.edu glusterfs-
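+       (to restore it later, presumably the reverse: oc label node/ipekatrin1.ipe.kit.edu glusterfs=storage-host)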
+    * We further need to remove 192.168.12.1 from 'endpoints/gfs' (per namespace) to avoid possible problems.
+    * On ipekatrin1, the /etc/fstab glusterfs mounts should be changed from 'localhost' to some other server (or commented out altogether). GlusterFS mounts
+      should be changed from localhost to the following (or probably just 12.2, as it is the only host containing the data and going via an intermediary makes no sense)
+       192.168.12.2,192.168.12.3:<vol> /mnt/vol glusterfs defaults,_netdev 0 0
+    * All RAID volumes should also be temporarily commented out in /etc/fstab and systemd
+ systemctl list-units --type=mount | grep gluster
+    * Further configuration changes are required to run the node without glusterfs while causing no damage to the rest of the system.
+      GlusterFS might be referenced via: /etc/hosts, /etc/fstab, /etc/systemd/system/*.mount, /etc/auto.*, scripts/cron,
+      endpoints (per namespace), inline gluster volumes in PVs (global),
+      gluster-block endpoints / tcmu gateway list, sc (heketi storageclass) and controllers (ds, deploy, sts); just in case, check heketi cm/secrets.
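+      A quick way to sweep the node for leftover references (a sketch; the path list just mirrors the candidates above):
+        grep -rn -e gluster -e 192.168.12.1 /etc/hosts /etc/fstab /etc/systemd/system /etc/auto.* /etc/cron* 2>/dev/null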
+ - Plan:
+ * Prepare application node [double-check before implementing]
+ + Adjust node label
+ + Edit 'gfs' endpoints in all namespaces.
+ + Check glusterblock/heketi, strange pv's.
+       + Check Ands monitoring & maintenance scripts
+       + Adjust /etc/fstab and check systemd based mounts. Shall we do something with hosts?
+ + /etc/nfs-ganesha on ipekatrin1 & ipekatrin2
+       + Check/change cron & monitoring scripts
+       + Check for backup scripts; it is probably written on the RAID controller.
+ + Grep in OpenShift configs (and /etc globally) just in case
+       + Google for other possible culprits beyond the above.
+ + Boot ipekatrin1 and check that all is fine
+ * cronJobs
+ > Set affinity to ipekatrin1.
+ > Restart cronJobs (maybe reduce intervals)
+ * copy cluster backups out
+ * ToDo
+       > Ideally, eliminate cronJobs altogether for the rest of the KaaS1 lifetime and replace them with a continuously running cron daemon inside the container
+ > Rebuild ipekatrinbackupserv1 as new gluster node (using disks) and try connecting it to the cluster
+
+ 2025.10.28-31
+ - Hardware
+    * Re-assembled the ipekatrin1 disks in the ipekatrinbackupserv1 backplane using a new LSI 9361-8i RAID controller. The original LSI 9271-8i was removed.
+    * Put the old (SAS2) disks from ipekatrinbackupserv1 into ipekatrin1. Imported the RAID configs; the RAID started and seems to work stably with the SAS2 setup.
+ - Software
+ * Removed glusterfs & fat_storage labels from ipekatrin1.ipe.kit.edu node
+ oc label node/ipekatrin1.ipe.kit.edu glusterfs-
+ oc label node/ipekatrin1.ipe.kit.edu fat_storage-
+    * Identified all endpoints used in PVs. No PV hardcodes IPs directly (and it seems unsupported anyway)
+      Edited endpoints: gfs glusterfs-dynamic-etcd glusterfs-dynamic-metrics-cassandra-1 glusterfs-dynamic-mongodb glusterfs-dynamic-registry-claim glusterfs-dynamic-sharelatex-docker
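+      (For reference, the set of endpoints referenced by gluster PVs can be listed with something like the jq sketch below; not necessarily the exact command used:)
+        oc get pv -o json | jq -r '.items[] | select(.spec.glusterfs) | .spec.glusterfs.endpoints' | sort -u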
+    * Verified that no glusterblock devices are used by pods or from outside (no iSCSI devices). Checked that the heketi storageClass can be safely disabled without affecting existing volumes
+      Terminated the heketi/glusterblock services, removed the storageclasses
+    * Checked the ands-distributed scripts & crons. No references to gluster. Monitoring checks the RAID status, but this is probably not critical as it would just report an error (which is true)
+    * Set nfsganesha cluster nodes to andstorage2 only on ipekatrin1/2 (no active server on ipekatrin3). The service is inactive at the moment
+      Anyway, double-check that it is disabled on ipekatrin1 on the first boot
+ * Found active 'block' volume in glusterfs. Checked it is empty and is not used by any active 'pv'. Stopped and deleted.
+ * Backup is done on /mnt/provision which should work in new configuration. So, no changes are needed.
+ * Mount points adjusted.
+ - First Boot:
+ * Disable nfs-ganesha on first boot on ipekatrin1
+ * Verified that glusterfs is not started and gluster mounts are healthy
+    * etcd is running and seems healthy
+ ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints https://`hostname`:2379 member list
+ curl -v --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt -s https://192.168.13.1:2379/v2/stats/self
+    * origin-master-api and origin-master-controllers are running
+    * origin-node and docker failed. /var/lib/docker is on the RAID (mounted /var/lib/docker, but used via an LVM thin pool).
+    * Created '/var/lib/docker-local' for now and configured docker to use overlay2 in /etc/sysconfig/docker-storage
+ DOCKER_STORAGE_OPTIONS="--storage-driver=overlay2 --graph=/var/lib/docker-local"
+ * Adjusted selinux contexts
+ semanage fcontext -a -e /var/lib/docker /var/lib/docker-local
+ restorecon -R -v /var/lib/docker-local
+ * Infrastructure pods are running on ipekatrin1
+    * Checked that status and monitoring scripts are working [ seems reasonable to me ]
+       > RAID is not optimal and low data space is reported (/mnt/ands is not mounted)
+ > Docker is not reporting available Data/Metadata space (as we are on local folder)
+    * Check that /var/lib/docker-local space usage is monitored
+ > Via data space usage
+ - Problems
+    * We have '*-host' pvs bound to /mnt/hostdisk which are used by adei/mysql (nodes 2&3) and as the katrin temporary data folder. Currently we keep node1 as master, but disable scheduling
+ oc adm cordon ipekatrin1.ipe.kit.edu
+ - Backup
+ * Backups from 'provision' volume are taken to 'kaas-manager' VM
+ - Monitor
+ * Usage in /var/lib/docker-local [ space usage ]
+ - ToDo
+ * Try building storage RAID in ipekatrinbackupserv1 (SFF-8643 to SFF-8087 cable needed, RAID-to-backplane). Turn on, check data is accessible and turn-off.
+ * We shall order larger SSD for docker (LVM) and KATRIN temporary files (/mnt/hostraid). Once done, uncordon jobs on katrin2
+ oc adm uncordon ipekatrin1.ipe.kit.edu
+ * We might try building a smaller RAID from stable disk bays and move ADEI replica here (discuss!) or a larger from SAS2 drives if it proves more stable.
+ * We might be able to use Intel RES2SV240 or LSISAS2x28 expander board to reduce SAS3 to SAS2 speeds...
+
+ 2025.11.01-03
+ - Document attempts to recover storage raid
+ - GlusterFS changes and replication
+
diff --git a/scripts/disaster/gluster_endpoints/add_endpoints.sh b/scripts/disaster/gluster_endpoints/add_endpoints.sh
new file mode 100644
index 0000000..4badee9
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/add_endpoints.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+[[ $# -ne 1 ]] && { echo "Usage: $0 <NEW_NODE_IP>"; exit 1; }
+
+NEW_IP="$1"
+
+oc get namespaces -o name | sed 's/namespaces\///' | \
+while read NS; do
+ if oc -n "$NS" get endpoints gfs &>/dev/null; then
+ echo "✓ Patching $NS/gfs with $NEW_IP"
+# echo oc -n "$NS" patch endpoints gfs --type=strategic --patch="{\"subsets\":[{\"addresses\":[{\"ip\":\"$NEW_IP\"}]}]}"
+# echo oc -n "$NS" patch ep gfs --type=strategic --patch='{"subsets":[{"addresses":[{"ip":"'"$NEW_IP"'"}]}]}'
+ oc -n "$NS" patch ep gfs --type=json -p='[{"op": "add", "path": "/subsets/0/addresses/-", "value": {"ip": "'"$NEW_IP"'"}}]'
+ else
+ echo "✗ No gfs endpoint in $NS (skipping)"
+ fi
+done
+
+echo "Done. Verify: oc get ep gfs -A -o wide" \ No newline at end of file
diff --git a/scripts/disaster/gluster_endpoints/backups/ipekatrin1-edited.yaml b/scripts/disaster/gluster_endpoints/backups/ipekatrin1-edited.yaml
new file mode 100644
index 0000000..6a8dc63
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/backups/ipekatrin1-edited.yaml
@@ -0,0 +1,85 @@
+apiVersion: v1
+kind: Node
+metadata:
+ annotations:
+ alpha.kubernetes.io/provided-node-ip: 192.168.13.1
+ volumes.kubernetes.io/controller-managed-attach-detach: "true"
+ creationTimestamp: 2018-03-23T04:20:04Z
+ labels:
+ beta.kubernetes.io/arch: amd64
+ beta.kubernetes.io/os: linux
+ compute_node: "0"
+ fat_memory: "0"
+ fqdn: ipekatrin1.ipe.kit.edu
+ gpu_node: "0"
+ hostid: "1"
+ hostname: ipekatrin1
+ kubernetes.io/hostname: ipekatrin1.ipe.kit.edu
+ master: "1"
+ node-role.kubernetes.io/master: "true"
+ openshift-infra: apiserver
+ permanent: "1"
+ pod_node: "1"
+ production: "1"
+ region: infra
+ server: "1"
+ zone: default
+ name: ipekatrin1.ipe.kit.edu
+ resourceVersion: "1138908753"
+ selfLink: /api/v1/nodes/ipekatrin1.ipe.kit.edu
+ uid: 7616a958-2e51-11e8-969e-0cc47adef108
+spec:
+ externalID: ipekatrin1.ipe.kit.edu
+status:
+ addresses:
+ - address: 192.168.13.1
+ type: InternalIP
+ - address: ipekatrin1.ipe.kit.edu
+ type: Hostname
+ allocatable:
+ cpu: "40"
+ memory: 263757760Ki
+ pods: "250"
+ capacity:
+ cpu: "40"
+ memory: 263860160Ki
+ pods: "250"
+ conditions:
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: OutOfDisk
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: MemoryPressure
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: DiskPressure
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: Ready
+ daemonEndpoints:
+ kubeletEndpoint:
+ Port: 10250
+ nodeInfo:
+ architecture: amd64
+ bootID: a87a0b63-abf8-4b1d-9a1a-49197b26817e
+ containerRuntimeVersion: docker://1.12.6
+ kernelVersion: 3.10.0-693.21.1.el7.x86_64
+ kubeProxyVersion: v1.7.6+a08f5eeb62
+ kubeletVersion: v1.7.6+a08f5eeb62
+ machineID: 73b3f7f0088b44adb16582623d7747b1
+ operatingSystem: linux
+ osImage: CentOS Linux 7 (Core)
+ systemUUID: 00000000-0000-0000-0000-0CC47ADEF108
diff --git a/scripts/disaster/gluster_endpoints/backups/ipekatrin1.yaml b/scripts/disaster/gluster_endpoints/backups/ipekatrin1.yaml
new file mode 100644
index 0000000..5e45f12
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/backups/ipekatrin1.yaml
@@ -0,0 +1,87 @@
+apiVersion: v1
+kind: Node
+metadata:
+ annotations:
+ alpha.kubernetes.io/provided-node-ip: 192.168.13.1
+ volumes.kubernetes.io/controller-managed-attach-detach: "true"
+ creationTimestamp: 2018-03-23T04:20:04Z
+ labels:
+ beta.kubernetes.io/arch: amd64
+ beta.kubernetes.io/os: linux
+ compute_node: "0"
+ fat_memory: "0"
+ fat_storage: "1"
+ fqdn: ipekatrin1.ipe.kit.edu
+ glusterfs: storage-host
+ gpu_node: "0"
+ hostid: "1"
+ hostname: ipekatrin1
+ kubernetes.io/hostname: ipekatrin1.ipe.kit.edu
+ master: "1"
+ node-role.kubernetes.io/master: "true"
+ openshift-infra: apiserver
+ permanent: "1"
+ pod_node: "1"
+ production: "1"
+ region: infra
+ server: "1"
+ zone: default
+ name: ipekatrin1.ipe.kit.edu
+ resourceVersion: "1137118496"
+ selfLink: /api/v1/nodes/ipekatrin1.ipe.kit.edu
+ uid: 7616a958-2e51-11e8-969e-0cc47adef108
+spec:
+ externalID: ipekatrin1.ipe.kit.edu
+status:
+ addresses:
+ - address: 192.168.13.1
+ type: InternalIP
+ - address: ipekatrin1.ipe.kit.edu
+ type: Hostname
+ allocatable:
+ cpu: "40"
+ memory: 263757760Ki
+ pods: "250"
+ capacity:
+ cpu: "40"
+ memory: 263860160Ki
+ pods: "250"
+ conditions:
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: OutOfDisk
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: MemoryPressure
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: DiskPressure
+ - lastHeartbeatTime: 2025-10-23T19:01:20Z
+ lastTransitionTime: 2025-10-23T19:02:02Z
+ message: Kubelet stopped posting node status.
+ reason: NodeStatusUnknown
+ status: Unknown
+ type: Ready
+ daemonEndpoints:
+ kubeletEndpoint:
+ Port: 10250
+ nodeInfo:
+ architecture: amd64
+ bootID: a87a0b63-abf8-4b1d-9a1a-49197b26817e
+ containerRuntimeVersion: docker://1.12.6
+ kernelVersion: 3.10.0-693.21.1.el7.x86_64
+ kubeProxyVersion: v1.7.6+a08f5eeb62
+ kubeletVersion: v1.7.6+a08f5eeb62
+ machineID: 73b3f7f0088b44adb16582623d7747b1
+ operatingSystem: linux
+ osImage: CentOS Linux 7 (Core)
+ systemUUID: 00000000-0000-0000-0000-0CC47ADEF108
diff --git a/scripts/disaster/gluster_endpoints/backups/storageclasses_backup_2025-10-29.yaml b/scripts/disaster/gluster_endpoints/backups/storageclasses_backup_2025-10-29.yaml
new file mode 100644
index 0000000..77e3452
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/backups/storageclasses_backup_2025-10-29.yaml
@@ -0,0 +1,38 @@
+apiVersion: v1
+items:
+- apiVersion: storage.k8s.io/v1
+ kind: StorageClass
+ metadata:
+ creationTimestamp: 2018-03-23T04:24:52Z
+ name: glusterfs-storage
+ namespace: ""
+ resourceVersion: "6403"
+ selfLink: /apis/storage.k8s.io/v1/storageclasses/glusterfs-storage
+ uid: 219550a3-2e52-11e8-969e-0cc47adef108
+ parameters:
+ resturl: http://heketi-storage.glusterfs.svc.cluster.local:8080
+ restuser: admin
+ secretName: heketi-storage-admin-secret
+ secretNamespace: glusterfs
+ provisioner: kubernetes.io/glusterfs
+- apiVersion: storage.k8s.io/v1
+ kind: StorageClass
+ metadata:
+ creationTimestamp: 2018-03-23T04:25:31Z
+ name: glusterfs-storage-block
+ namespace: ""
+ resourceVersion: "6528"
+ selfLink: /apis/storage.k8s.io/v1/storageclasses/glusterfs-storage-block
+ uid: 38ff5088-2e52-11e8-969e-0cc47adef108
+ parameters:
+ chapauthenabled: "true"
+ hacount: "3"
+ restsecretname: heketi-storage-admin-secret-block
+ restsecretnamespace: glusterfs
+ resturl: http://heketi-storage.glusterfs.svc.cluster.local:8080
+ restuser: admin
+ provisioner: gluster.org/glusterblock
+kind: List
+metadata:
+ resourceVersion: ""
+ selfLink: ""
diff --git a/scripts/disaster/gluster_endpoints/check_pv.sh b/scripts/disaster/gluster_endpoints/check_pv.sh
new file mode 100644
index 0000000..1f2a7e4
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/check_pv.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Select gluster-backed PVs that do not use the shared 'gfs' endpoints; output only the PV name
+# so the word-splitting in the loop below stays correct
+pvs=$(oc get pv -o json | jq -r '
+  .items[]
+  | select(.spec.glusterfs?)
+  | select(.spec.glusterfs.endpoints != "gfs")
+  | .metadata.name')
+
+
+echo "PV usage:"
+echo
+
+#pvs=$(oc get pv --no-headers | awk '{print $1}')
+
+for pv in $pvs; do
+ # Extract PVC and namespace bound to PV
+ pvc=$(oc get pv "$pv" -o jsonpath='{.spec.claimRef.name}' 2>/dev/null)
+ ns=$(oc get pv "$pv" -o jsonpath='{.spec.claimRef.namespace}' 2>/dev/null)
+
+ if [[ -z "$pvc" || -z "$ns" ]]; then
+ echo "$pv → UNUSED"
+ echo
+ continue
+ fi
+
+ echo "$pv → PVC: $ns/$pvc"
+
+  # Use jq per pod instead of a JSONPath filter (much safer)
+ pods=$(oc get pods -n "$ns" -o name \
+ | while read -r pod; do
+ oc get "$pod" -n "$ns" -o json \
+ | jq -r --arg pvc "$pvc" '
+ . as $pod |
+ .spec.volumes[]?
+ | select(.persistentVolumeClaim? and .persistentVolumeClaim.claimName == $pvc)
+ | $pod.metadata.name
+ ' 2>/dev/null
+ done \
+ | sort -u
+ )
+
+ if [[ -z "$pods" ]]; then
+ echo " → PVC bound but no running Pod is using it"
+ else
+ echo " → Pods:"
+ echo "$pods" | sed 's/^/ - /'
+ fi
+
+ echo
+done
diff --git a/scripts/disaster/gluster_endpoints/find_inline_gluster_in_pods.sh b/scripts/disaster/gluster_endpoints/find_inline_gluster_in_pods.sh
new file mode 100644
index 0000000..e116fb7
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/find_inline_gluster_in_pods.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+for p in $(oc get pods --all-namespaces --no-headers | awk '{print $2":"$1}'); do
+ pod=${p%:*}; ns=${p#*:};
+ echo "=== $ns/$pod ==="
+ oc -n "$ns" get pod "$pod" -o json | grep gluster
+done
diff --git a/scripts/disaster/gluster_endpoints/remove_endpoints.sh b/scripts/disaster/gluster_endpoints/remove_endpoints.sh
new file mode 100644
index 0000000..f4623f6
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/remove_endpoints.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+TARGET_IP="192.168.12.1"
+
+for ns in $(oc get ns --no-headers | awk '{print $1}'); do
+ for epname in gfs glusterfs-dynamic-etcd glusterfs-dynamic-metrics-cassandra-1 glusterfs-dynamic-mongodb glusterfs-dynamic-registry-claim glusterfs-dynamic-sharelatex-docker; do
+ ep=$(oc get endpoints "$epname" -n "$ns" -o json 2>/dev/null) || continue
+
+ modified="$(printf '%s' "$ep" | jq \
+ --arg ip "$TARGET_IP" \
+ '(.subsets[]?.addresses |= map(select(.ip != $ip)))'
+ )"
+
+ if diff <(echo "$ep") <(echo "$modified") >/dev/null; then
+ continue
+ fi
+
+ echo -n "Namespace: $ns/$epname:"
+ echo -n "$ep" | jq '.subsets[].addresses'
+ echo -n " ===> "
+ echo -n "$modified" | jq '.subsets[].addresses'
+ echo
+
+    # The following line APPLIES the change; comment it out for a dry run:
+ echo "$modified" | oc replace -f - -n "$ns"
+ done
+done
diff --git a/scripts/disaster/gluster_endpoints/remove_storageclasses.sh b/scripts/disaster/gluster_endpoints/remove_storageclasses.sh
new file mode 100644
index 0000000..063650d
--- /dev/null
+++ b/scripts/disaster/gluster_endpoints/remove_storageclasses.sh
@@ -0,0 +1,7 @@
+# Backups provided
+oc delete sc glusterfs-storage
+oc delete sc glusterfs-storage-block
+
+# It was a single replica
+oc scale dc/glusterblock-storage-provisioner-dc -n glusterfs --replicas=0
+oc scale dc/heketi-storage -n glusterfs --replicas=0
diff --git a/scripts/disaster/walker.sh b/scripts/disaster/walker.sh
new file mode 100644
index 0000000..0211105
--- /dev/null
+++ b/scripts/disaster/walker.sh
@@ -0,0 +1,73 @@
+#! /bin/bash
+
+
+#find /mnt/provision/kaas/adei -type f -print0 | xargs -0 -I{} -n 1 sh -c ' dd if="$1" of=/dev/null bs=1M status=none || true; sleep .5' _ "{}"
+
+#find /mnt/ands/glusterfs/brick-provision/kaas/bora -type f -size 0 -print0 | \
+#while IFS= read -r -d '' f; do
+#  echo "Removing $f"
+# setfattr -x trusted.glusterfs.mdata "$f" 2>/dev/null || true
+# for a in $(getfattr -d -m trusted.afr -e hex "$f" 2>/dev/null | awk -F= '/trusted\.afr/{print $1}'); do
+# setfattr -x "$a" "$f" 2>/dev/null || true
+# done
+#done
+
+#echo 3 | sudo tee /proc/sys/vm/drop_caches
+#find /mnt/wave/ -type f -print0 | xargs -0 -I{} -n 1 -P 8 sh -c '
+# f="$1"
+# dd if="$f" of=/dev/null bs=1M status=none || true;
+# sz=$(stat -c%s "$f" 2>/dev/null || echo 0)
+# echo "$f $sz"
+# if [ "$sz" -eq 0 ]; then
+# # give gluster a breath and try again, like you do manually
+# sleep 0.5
+# dd if="$f" of=/dev/null bs=1M status=none 2>/dev/null || true
+## sz=$(stat -c%s "$f" 2>/dev/null || echo 0)
+# fi
+# ' _ "{}"
+
+#find /mnt/datastore/services/gogs -type f -print0 | xargs -0 -n200 -P16 rm -
+#find /mnt/datastore/services/gogs -depth -type d -empty -delete
+#find /mnt/datastore/services/gogs/repositories -maxdepth 1 -mindepth 1 -type d -print0 | xargs -0 -I{} -n1 -P200 sh -c 'rm -rf "$1"' _ "{}"
+
+
+#echo 3 | sudo tee /proc/sys/vm/drop_caches
+#find /mnt/ands/glusterfs/brick-katrin_data -name .glusterfs -prune -o -type f -size 0 -print0 | xargs -0 -I{} -n 1 -P 8 sh -c '
+# fbrick="$1"
+# brick_prefix="/mnt/ands/glusterfs/brick-katrin_data"
+# mount_prefix="/mnt/katrin"
+# fmount="${fbrick/#$brick_prefix/$mount_prefix}"
+# dd if="$fmount" of=/dev/null bs=1M status=none || true;
+# sz=$(stat -c%s "$fbrick" 2>/dev/null || echo 0)
+# echo "$fmount $sz"
+# if [ "$sz" -eq 0 ]; then
+# # give gluster a breath and try again, like you do manually
+# sleep 0.5
+# dd if="$fmount" of=/dev/null bs=1M status=none 2>/dev/null || true
+## sz=$(stat -c%s "$fbrick" 2>/dev/null || echo 0)
+# fi
+# ' _ "{}"
+#
+
+echo 3 | sudo tee /proc/sys/vm/drop_caches
+find /mnt/ands/glusterfs/brick-katrin_data -name .glusterfs -prune -o -type f -print0 | xargs -0 -I{} -n 1 -P 8 sh -c '
+ fbrick="$1"
+ mount_prefix="/mnt/katrin"
+ brick_prefix="/mnt/ands/glusterfs/brick-katrin_data"
+ fmount="${fbrick/#$brick_prefix/$mount_prefix}"
+ szbrick=$(stat -c%s "$fbrick" 2>/dev/null || echo 0)
+ szmount=$(stat -c%s "$fmount" 2>/dev/null || echo 0)
+ if [ $szbrick -ne $szmount ]; then
+ dd if="$fmount" of=/dev/null bs=1M status=none 2>/dev/null || true
+ sz=$(stat -c%s "$fbrick" 2>/dev/null || echo 0)
+ while [ $sz -ne $szmount ]; do
+ echo "* $fmount $szmount $szbrick => $sz"
+ sleep 1
+ dd if="$fmount" of=/dev/null bs=1M status=none 2>/dev/null || true
+ sz=$(stat -c%s "$fbrick" 2>/dev/null || echo 0)
+ done
+ echo "$fmount $szmount $szbrick => $sz"
+ fi
+ ' _ "{}"
+
+
diff --git a/scripts/maintain/gluster/bricks_move_heketi.sh b/scripts/maintain/gluster/bricks_move_heketi.sh
new file mode 100644
index 0000000..36b8602
--- /dev/null
+++ b/scripts/maintain/gluster/bricks_move_heketi.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+HOST="192.168.12.1"
+NEW_BASE="/mnt/ands/glusterfs/vg_ce3a7c1bb6da5c98ce4bb3e76aeacb8b"
+GLUSTER_BIN="gluster"
+DRYRUN=1 # set to 0 to actually run
+GLUSTER_UID=107 # adjust if your gluster user has a different uid/gid
+
+# get all volumes like vol_<uid>
+VOLS=$($GLUSTER_BIN volume list | grep '^vol_')
+
+for VOL in $VOLS; do
+ # find bricks on this host
+ # lines look like: "Brick2: 192.168.12.1:/var/lib/heketi/.../brick"
+ mapfile -t OLDBRICKS < <($GLUSTER_BIN volume info "$VOL" \
+ | grep "$HOST:" \
+ | awk '{print $2}')
+
+ # skip volumes that don't have a brick on this host
+ if [ ${#OLDBRICKS[@]} -eq 0 ]; then
+ continue
+ fi
+
+ for OLD in "${OLDBRICKS[@]}"; do
+ BRICKID=$(echo "$OLD" | sed -n 's#.*/\(brick_[^/]*\)/brick#\1#p')
+ if [ -z "$BRICKID" ]; then
+ echo "WARN: could not extract brick ID from $OLD"
+ continue
+ fi
+
+ NEW="$HOST:$NEW_BASE/$BRICKID"
+
+ echo "=== volume: $VOL ==="
+ echo "old brick: $OLD"
+ echo "new brick: $NEW"
+
+
+    [ "$DRYRUN" -eq 1 ] && { echo "DRYRUN: replace-brick $VOL $OLD -> $NEW"; continue; }
+    $GLUSTER_BIN volume replace-brick "$VOL" "$OLD" "$NEW" commit force
+
+ done
+done
diff --git a/scripts/maintain/gluster/bricks_populate.sh b/scripts/maintain/gluster/bricks_populate.sh
new file mode 100644
index 0000000..15790a1
--- /dev/null
+++ b/scripts/maintain/gluster/bricks_populate.sh
@@ -0,0 +1,11 @@
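+# Re-creates the gluster metadata on freshly restored (empty) bricks so glusterd accepts them:
+# sets the root gfid (00..01) and trusted.glusterfs.volume-id, and creates the .glusterfs skeleton.
+# Intended to be run from the directory holding the brick-* mounts (here /mnt/ands/glusterfs).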
+for brick in brick-*; do
+ [ -d $brick/.glusterfs ] && continue
+ name=${brick#brick-}
+
+ echo "$name - $brick"
+
+ setfattr -n trusted.gfid -v 0sAAAAAAAAAAAAAAAAAAAAAQ== /mnt/ands/glusterfs/$brick
+ setfattr -n trusted.glusterfs.volume-id -v 0x$(gluster volume info $name | grep 'Volume ID' | awk '{print $3}' | tr -d '-') /mnt/ands/glusterfs/$brick
+ mkdir -p /mnt/ands/glusterfs/$brick/.glusterfs/{indices,exports,xattrop,locks}
+
+done
diff --git a/scripts/maintain/gluster/heal-walk.sh b/scripts/maintain/gluster/heal-walk.sh
new file mode 100644
index 0000000..4c8d134
--- /dev/null
+++ b/scripts/maintain/gluster/heal-walk.sh
@@ -0,0 +1,35 @@
+#! /bin/bash
+
+
+#find /mnt/provision/kaas/adei -type f -print0 | xargs -0 -I{} -n 1 sh -c ' dd if="$1" of=/dev/null bs=1M status=none || true; sleep .5' _ "{}"
+
+#find /mnt/ands/glusterfs/brick-provision/kaas/bora -type f -size 0 -print0 | \
+#while IFS= read -r -d '' f; do
+#  echo "Removing $f"
+# setfattr -x trusted.glusterfs.mdata "$f" 2>/dev/null || true
+# for a in $(getfattr -d -m trusted.afr -e hex "$f" 2>/dev/null | awk -F= '/trusted\.afr/{print $1}'); do
+# setfattr -x "$a" "$f" 2>/dev/null || true
+# done
+#done
+
+#find /mnt/datastore/services/gogs -type f -print0 | xargs -0 -n200 -P16 rm -
+#find /mnt/datastore/services/gogs -depth -type d -empty -delete
+#find /mnt/datastore/services/gogs/repositories -maxdepth 1 -mindepth 1 -type d -print0 | xargs -0 -I{} -n1 -P200 sh -c 'rm -rf "$1"' _ "{}"
+
+
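+# Active part: read every file through the gluster mount to trigger self-heal; files that still read
+# back as 0 bytes get a short pause and one retry.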
+echo 3 | sudo tee /proc/sys/vm/drop_caches
+find /mnt/wave/ -type f -print0 | xargs -0 -I{} -n 1 -P 8 sh -c '
+ f="$1"
+ dd if="$f" of=/dev/null bs=1M status=none || true;
+ sz=$(stat -c%s "$f" 2>/dev/null || echo 0)
+ echo "$f $sz"
+ if [ "$sz" -eq 0 ]; then
+ # give gluster a breath and try again, like you do manually
+ sleep 0.5
+ dd if="$f" of=/dev/null bs=1M status=none 2>/dev/null || true
+# sz=$(stat -c%s "$f" 2>/dev/null || echo 0)
+ fi
+ ' _ "{}"
+
+
+#find /mnt/wave/ -type f -print0 | xargs -0 -I{} -n 1 -P 8 sh -c 'echo $1; dd if="$1" of=/dev/null bs=1M status=none || true; sleep .5' _ {}
diff --git a/setup/configs/openshift.yml b/setup/configs/openshift.yml
index 7ead691..b6b0a04 100644
--- a/setup/configs/openshift.yml
+++ b/setup/configs/openshift.yml
@@ -23,31 +23,34 @@ ands_openshift_users:
   pdv: { name: "IPE Administration Account" }
katrin: { name: "KATRIN Project" }
csa: { name: "Suren A. Chilingaryan", email: "csa@suren.me", uid: "1001", shell: "/bin/bash" }
+ jan: { name: "Jan Behrens", email: "jan.behrens@kit.edu" }
kopmann: { name: "Andreas Kopmann", email: "kopmann@kit.edu" }
ntj: { name: "Nicholas Tan Jerome", email: "nicholas.jerome@kit.edu" }
jonasteufel: { name: "Jonas Teufel", email: "jonseb1998@gmail.com" }
jalal: { name: "Jalal Mostafa", email: "jalal.mostapha@outlook.com" }
gil: { name: "Woosik Gil", email: "gil@kit.edu" }
jhar: { name: "Julius Hartmann", email: "julius.hartmann@kit.edu" }
-
+ bbieringer: { name: "Benedikt Bieringer", email: "benedikt.b@uni-muenster.de" }
+ fynn: { name: "Fynn Tschacher", email: "fynn.tschacher@student.kit.edu" }
ands_openshift_roles:
- cluster-admin: csa, jalal
- kaas/admin: csa, jalal, kopmann
- katrin/admin: katrin, jalal
- status/admin: katrin, jalal
+ cluster-admin: csa, jalal, jan, ntj
+ kaas/admin: csa, jalal, kopmann, ntj, bbieringer
+ katrin/admin: katrin, jalal, jan, ntj, bbieringer, fynn
+ status/admin: katrin, jalal, ntj, bbieringer
adei/admin: csa
- adei/view: pdv, kopmann, jalal
- adei/kaas-maintain: pdv, kopmann, jalal
+ adei/view: pdv, kopmann, jalal, ntj
+ adei/kaas-maintain: pdv, kopmann, jalal, ntj
adai/admin: csa, kopmann, jalal
bora/admin: csa, ntj, gil, jalal, katrin, kopmann
epics/admin: csa, jalal
wave/admin: csa, ntj
- services/admin: csa, jalal, katrin
- web/admin: kopmann, jonasteufel, jalal
- mon/admin: csa, jalal
- test/admin: csa, ntj, kopmann, katrin, jalal
- jupyter/admin: csa, kopmann, jalal, jhar
+ services/admin: csa, jalal, katrin, jan, ntj, bbieringer
+ web/admin: kopmann, jonasteufel, jalal, ntj, bbieringer
+ mon/admin: csa, jalal, ntj
+ test/admin: csa, ntj, kopmann, katrin, jalal, jan
+ jupyter/admin: csa, kopmann, jalal, jhar, ntj
+ hpc/admin: csa, jalal, fawez
ands_repos:
docker: "{{ ands_info.git_url }}/ands-docker"
diff --git a/setup/users/htpasswd b/setup/users/htpasswd
index 4939761..f1027f4 100644
--- a/setup/users/htpasswd
+++ b/setup/users/htpasswd
@@ -1,9 +1,13 @@
pdv:$apr1$ACvj6uUa$Nm1Vq8hZq3RzTtaYpAHv01
csa:$apr1$IqEwdnzy$UAdd8ZSFnXommBbj29w3c0
katrin:$apr1$94lAgTxt$LVOWdwye92nsZVqVT7VaG1
+jan:$2y$05$RJqQQczSaO0sIiYLVI3CI.mKPIOvxnJUTYbUjyXqjJjLQJQinBlQu
ntj:$apr1$G5/ThWdp$kFLsj/hO9jIYYP.Zab9kC/
kopmann:$apr1$jU8jCdPh$u7ZUBiT3gzxlf1xPJl6FI.
jonasteufel:$apr1$2dsiiZ1p$Us/5i8DEt9fxeliGy7L6h/
jalal:$apr1$hwKRrL2x$RbtSQbfZZqPuvHL9YhCKp.
gil:$apr1$p2khs49v$7poH4dUbTpCyhEO5JmgLx0
jhar:$apr1$pDAXDbT4$r2f1SP5D71KplWZKLNi27.
+fawez:$apr1$yqROFhQ9$QdvOPFZ3zAbmtI9Dv53WU.
+bbieringer:$apr1$JqJxhUCR$5KfXdYvZzBFBbNm.mnrtd.
+fynn:$2y$05$3jGd00iMyFrLLAsFOrfRBOM5pEBnM1U6zIso/bM5NzNM3pmiKIa86