From d01411c3c4af6c724a3800c621804ea979818d6d Mon Sep 17 00:00:00 2001 From: Peter Mikus Date: Thu, 10 Oct 2019 15:31:28 +0000 Subject: Cleanup via Ansible + Remove dependency on topo_ scripts that depends on custom SSH() that depends on framework itself. This way the cleanup is independent of failure in our SSH libs. + Simple ansible command can do cleanup of a machine: ansible-playbook --inventory inventories/lf_inventory/hosts site.yaml \ --limit '10.32.8.18' --tags 'cleanup' + Add vpp_device reset and cleanup. + Remove historical scripts. - Still in testing beta phase. - Need to add SRIOV cleanup. Signed-off-by: Peter Mikus Change-Id: I68e23304c7ad01041f51263c328c6e8d9b555cb7 --- .../ansible/roles/cleanup/files/reset_vppdevice.sh | 113 +++++++++++++++++++++ .../roles/cleanup/tasks/kill_containers.yaml | 28 +++++ .../ansible/roles/cleanup/tasks/kill_process.yaml | 27 +++++ .../ansible/roles/cleanup/tasks/main.yaml | 31 ++++++ .../roles/cleanup/tasks/remove_package.yaml | 31 ++++++ .../ansible/roles/cleanup/tasks/sut.yaml | 52 ++++++++++ .../ansible/roles/cleanup/tasks/tg.yaml | 14 +++ .../ansible/roles/cleanup/tasks/vpp_device.yaml | 15 +++ 8 files changed, 311 insertions(+) create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/files/reset_vppdevice.sh create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml create mode 100644 resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml (limited to 'resources/tools/testbed-setup/ansible/roles') diff 
--git a/resources/tools/testbed-setup/ansible/roles/cleanup/files/reset_vppdevice.sh b/resources/tools/testbed-setup/ansible/roles/cleanup/files/reset_vppdevice.sh new file mode 100644 index 0000000000..ede2db1273 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/files/reset_vppdevice.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +set -euo pipefail + +function die () { + # Print the message to standard error end exit with error code specified + # by the second argument. + # + # Hardcoded values: + # - The default error message. + # Arguments: + # - ${1} - The whole error message, be sure to quote. Optional + # - ${2} - the code to exit with, default: 1. + + set +eu + warn "${1:-Unspecified run-time error occurred!}" + exit "${2:-1}" +} + + +function set_eligibility_off { + # Set Nomad eligibility to ineligible for scheduling. Fail otherwise. + + set -euo pipefail + + node_id="$(nomad node status | grep $(hostname) | cut -d ' ' -f 1)" || die + node_status="$(nomad node status | grep $(hostname))" || die + + if [[ "${node_status}" != *"ineligible"* ]]; then + nomad node eligibility -disable "${node_id}" || die + node_status="$(nomad node status | grep $(hostname))" || die + if [[ "${node_status}" != *"ineligible"* ]]; then + die "Set eligibility off failed!" + fi + fi +} + + +function set_eligibility_on { + # Set Nomad eligibility to eligible for scheduling. Fail otherwise. + + set -euo pipefail + + node_id="$(nomad node status | grep $(hostname) | cut -d ' ' -f 1)" || die + node_status="$(nomad node status | grep $(hostname))" || die + + if [[ "${node_status}" == *"ineligible"* ]]; then + nomad node eligibility -enable "${node_id}" || die + node_status="$(nomad node status | grep $(hostname))" || die + if [[ "${node_status}" == *"ineligible"* ]]; then + die "Set eligibility on failed!" + fi + fi +} + + +function restart_vfs_service { + # Stop and start VF serice. This will reinitialize VFs and driver mappings. 
+ + set -euo pipefail + + warn "Restarting VFs service (this may take few minutes)..." + sudo service csit-initialize-vfs stop || die "Failed to stop VFs service!" + sudo service csit-initialize-vfs start || die "Failed to start VFs service!" +} + + +function wait_for_pending_containers { + # Wait in a loop for a defined amount of time for pending containers to + # quit gracefully. If any are left: force kill them when forced, else fail. + + # Arguments: + # - ${1} - Optional. "force" or "--force" enables the force kill. + + set -euo pipefail + + retries=60 + wait_time=60 + containers=(docker ps --quiet --filter "name=csit*") # quoted so the shell cannot glob-expand the filter + + for i in $(seq 1 ${retries}); do + mapfile -t pending_containers < <( "${containers[@]}" ) || die + warn "Waiting for pending containers [${pending_containers[@]}] ..." + if [ ${#pending_containers[@]} -eq 0 ]; then + break + fi + sleep "${wait_time}" || die + done + if [ ${#pending_containers[@]} -ne 0 ]; then + if [[ "${1-}" == "force" || "${1-}" == "--force" ]]; then # the cleanup role invokes this script with --force + warn "Force killing [${pending_containers[@]}] ..." + docker rm --force "${pending_containers[@]}" || die + else + die "Still few containers running!" + fi + fi +} + + +function warn () { + # Print the message to standard error. + # + # Arguments: + # - ${@} - The text of the message.
+ + echo "$@" >&2 +} + + +set_eligibility_off || die +wait_for_pending_containers "${@}" || die +restart_vfs_service || die +set_eligibility_on || die diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml new file mode 100644 index 0000000000..a61aa6ceee --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_containers.yaml @@ -0,0 +1,28 @@ +--- +# file: roles/cleanup/tasks/kill_containers.yaml (removes every Docker and LXC container listed on the host) + +- name: Kill container - Get running Docker containers + shell: "docker ps -aq" # -a: stopped containers are listed too, -q: print IDs only + register: running_containers + changed_when: no + tags: kill-containers + +- name: Kill container - Remove all Docker containers + docker_container: + name: "{{ item }}" + state: absent + with_items: "{{ running_containers.stdout_lines }}" + tags: kill-containers + +- name: Kill container - Get running LXC containers + shell: "lxc-ls -1" # -1: one container name per line, so stdout_lines yields one item per container + register: running_containers + changed_when: no + tags: kill-containers + +- name: Kill container - Remove all LXC containers + lxc_container: + name: '{{ item }}' + state: absent + with_items: "{{ running_containers.stdout_lines }}" + tags: kill-containers diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml new file mode 100644 index 0000000000..4a1180b77f --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/kill_process.yaml @@ -0,0 +1,27 @@ +--- +# file: roles/cleanup/tasks/kill_process.yaml (expects var "process": collect matching PIDs, kill, wait, then kill -9 the survivors) + +- name: Kill process - Get pid of {{ process }} + shell: "ps -ef | grep -v grep | grep -w {{ process }} | awk '{print $2}'" + when: > + process is defined and process != "" + register: running_processes + tags: kill-process + +- name: Kill process - Safe kill {{ process }} + shell: "kill {{ item }}" + with_items: "{{ running_processes.stdout_lines }}" + tags: kill-process + +-
wait_for: + path: "/proc/{{ item }}/status" + state: absent + with_items: "{{ running_processes.stdout_lines }}" + ignore_errors: yes # a failed wait is not fatal: failed items are picked up by the force-kill task below + register: killed_processes + tags: kill-process + +- name: Kill process - Force kill {{ process }} + shell: "kill -9 {{ item }}" + with_items: "{{ killed_processes.results | select('failed') | map(attribute='item') | list }}" + tags: kill-process diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml new file mode 100644 index 0000000000..64a55c4672 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/main.yaml @@ -0,0 +1,31 @@ +--- +# file: roles/cleanup/tasks/main.yaml +# purpose: Structured per server cleanup tasks. +# - main: +# - tg: +# - Run tasks on TG servers only. +# - Cleanup processes (T-Rex, WRK). +# - sut: +# - Run tasks on SUT servers only. +# - Cleanup file leftovers (HoneyComb logs, core dumps). +# - Cleanup packages (VPP). +# - Cleanup processes (qemu, l3fwd, testpmd, docker, kubernetes) +# - Cleanup interfaces (TODO: no such task exists in sut.yaml yet). +# - vpp_device +# - Run tasks on vpp_device servers only.
+# - Reset SRIOV + +- name: tg specific + include_tasks: tg.yaml + when: "'tg' in group_names" + tags: cleanup + +- name: sut specific + include_tasks: sut.yaml + when: "'sut' in group_names" + tags: cleanup + +- name: vpp_device specific + include_tasks: vpp_device.yaml + when: "'vpp_device' in group_names" + tags: cleanup diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml new file mode 100644 index 0000000000..8f5ec8fefe --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/remove_package.yaml @@ -0,0 +1,31 @@ +--- +# file: roles/cleanup/tasks/remove_package.yaml (expects var "package"; purges it via apt on Ubuntu when dpkg reports it installed) + +- name: Remove package - Fix corrupted apt + shell: 'dpkg --configure -a' + when: > + ansible_distribution|lower == 'ubuntu' + tags: remove-package + +- name: Remove package - Check if {{ package }} is installed + shell: > + dpkg-query -W -f='${Status}' '{{ package }}' | grep 'install ok installed' + register: package_is_installed # rc is 0 only when grep finds "install ok installed" + failed_when: no # a non-zero rc just means "not installed"; the removal task checks rc itself + changed_when: no + when: > + ansible_distribution|lower == 'ubuntu' + tags: remove-package + +- name: Remove package - {{ package }} + apt: + name: '{{ package }}' + force: yes + purge: yes + state: absent + when: > + package is defined and + package != '' and + package_is_installed.rc == 0 and + ansible_distribution|lower == 'ubuntu' + tags: remove-package diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml new file mode 100644 index 0000000000..5083a96a29 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/sut.yaml @@ -0,0 +1,52 @@ +--- +# file: roles/cleanup/tasks/sut.yaml + +- name: Kill processes - qemu + import_tasks: kill_process.yaml + vars: + process: "qemu" + tags: kill-process + +- name: Kill processes - l3fwd + import_tasks: kill_process.yaml + vars: + process: "l3fwd" + tags: kill-process +
+- name: Kill processes - testpmd + import_tasks: kill_process.yaml + vars: + process: "testpmd" + tags: kill-process + +- name: Remove file or dir - HoneyComb logs + file: + state: absent + path: "/var/log/honeycomb" + tags: remove-file-dir + +- name: Remove file or dir - Core zip file + file: + state: absent + path: "/tmp/*tar.lzo.lrz.xz*" # NOTE(review): the file module takes path literally (no glob expansion), so this likely removes nothing - consider find + file + tags: remove-file-dir + +- name: Remove file or dir - Core dump file + file: + state: absent + path: "/tmp/*core*" # NOTE(review): same as above - the glob is not expanded by the file module + tags: remove-file-dir + +- name: Kill containers - Remove all containers + import_tasks: kill_containers.yaml + tags: kill-containers + +- name: Kubernetes - Reset + raw: 'kubeadm reset --force' + tags: kill-kubernetes + +- name: Remove packages - Remove VPP + import_tasks: remove_package.yaml + vars: + package: "*vpp*" + tags: remove-package diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml new file mode 100644 index 0000000000..f58cb59a1a --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/tg.yaml @@ -0,0 +1,14 @@ +--- +# file: roles/cleanup/tasks/tg.yaml + +- name: Kill processes - TRex + import_tasks: kill_process.yaml + vars: + process: "_t-rex" + tags: kill-process + +- name: Kill processes - WRK + import_tasks: kill_process.yaml + vars: + process: "wrk" + tags: kill-process diff --git a/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml new file mode 100644 index 0000000000..5b7713a554 --- /dev/null +++ b/resources/tools/testbed-setup/ansible/roles/cleanup/tasks/vpp_device.yaml @@ -0,0 +1,15 @@ +--- +# file: roles/cleanup/tasks/vpp_device.yaml (installs reset_vppdevice.sh on the host and runs it in force mode) + +- name: Reset vpp_device binary + copy: # plain copy, not template: the script's bash "${#...}" would be parsed by Jinja2 as a comment opener + src: 'files/reset_vppdevice.sh' + dest: '/usr/local/bin' + owner: 'root' + group: 'root' + mode: '744' # must be executable, the next task runs it by name + tags: reset-sriov + +- name: Reset vpp_device + raw: 'reset_vppdevice.sh force' # the script compares its first argument against the literal "force" + tags:
reset-sriov -- cgit 1.2.3-korg