diff options
Diffstat (limited to 'fdio.infra.ansible/roles/cleanup')
10 files changed, 442 insertions, 0 deletions
diff --git a/fdio.infra.ansible/roles/cleanup/files/reset_vppdevice.sh b/fdio.infra.ansible/roles/cleanup/files/reset_vppdevice.sh new file mode 100644 index 0000000000..ede2db1273 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/files/reset_vppdevice.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash + +set -euo pipefail + +function die () { + # Print the message to standard error end exit with error code specified + # by the second argument. + # + # Hardcoded values: + # - The default error message. + # Arguments: + # - ${1} - The whole error message, be sure to quote. Optional + # - ${2} - the code to exit with, default: 1. + + set +eu + warn "${1:-Unspecified run-time error occurred!}" + exit "${2:-1}" +} + + +function set_eligibility_off { + # Set Nomad eligibility to ineligible for scheduling. Fail otherwise. + + set -euo pipefail + + node_id="$(nomad node status | grep $(hostname) | cut -d ' ' -f 1)" || die + node_status="$(nomad node status | grep $(hostname))" || die + + if [[ "${node_status}" != *"ineligible"* ]]; then + nomad node eligibility -disable "${node_id}" || die + node_status="$(nomad node status | grep $(hostname))" || die + if [[ "${node_status}" != *"ineligible"* ]]; then + die "Set eligibility off failed!" + fi + fi +} + + +function set_eligibility_on { + # Set Nomad eligibility to eligible for scheduling. Fail otherwise. + + set -euo pipefail + + node_id="$(nomad node status | grep $(hostname) | cut -d ' ' -f 1)" || die + node_status="$(nomad node status | grep $(hostname))" || die + + if [[ "${node_status}" == *"ineligible"* ]]; then + nomad node eligibility -enable "${node_id}" || die + node_status="$(nomad node status | grep $(hostname))" || die + if [[ "${node_status}" == *"ineligible"* ]]; then + die "Set eligibility on failed!" + fi + fi +} + + +function restart_vfs_service { + # Stop and start VF serice. This will reinitialize VFs and driver mappings. + + set -euo pipefail + + warn "Restarting VFs service (this may take few minutes)..." + sudo service csit-initialize-vfs stop || die "Failed to stop VFs service!" + sudo service csit-initialize-vfs start || die "Failed to start VFs service!" +} + + +function wait_for_pending_containers { + # Wait in loop for defined amount of time for pending containers to + # gracefully quit them. If parameter force is specified. Force kill them. + + # Arguments: + # - ${@} - Script parameters. + + set -euo pipefail + + retries=60 + wait_time=60 + containers=(docker ps --quiet --filter name=csit*) + + for i in $(seq 1 ${retries}); do + mapfile -t pending_containers < <( ${containers[@]} ) || die + warn "Waiting for pending containers [${pending_containers[@]}] ..." + if [ ${#pending_containers[@]} -eq 0 ]; then + break + fi + sleep "${wait_time}" || die + done + if [ ${#pending_containers[@]} -ne 0 ]; then + if [[ "${1-}" == "force" ]]; then + warn "Force killing [${pending_containers[@]}] ..." + docker rm --force ${pending_containers[@]} || die + else + die "Still few containers running!" + fi + fi +} + + +function warn () { + # Print the message to standard error. + # + # Arguments: + # - ${@} - The text of the message. + + echo "$@" >&2 +} + + +set_eligibility_off || die +wait_for_pending_containers "${@}" || die +restart_vfs_service || die +set_eligibility_on || die diff --git a/fdio.infra.ansible/roles/cleanup/tasks/clean_images.yaml b/fdio.infra.ansible/roles/cleanup/tasks/clean_images.yaml new file mode 100644 index 0000000000..e030acbff2 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/clean_images.yaml @@ -0,0 +1,36 @@ +--- +# file: roles/cleanup/tasks/clean_images.yaml + +- name: Clean Docker Images + block: + - name: Clean Images - Prefetch Docker Images + cron: + name: "Prefetch docker image {{ item }}" + minute: "10" + hour: "7" + job: "/usr/bin/docker pull {{ item }}" + loop: + "{{ images_to_prefetch_by_arch[ansible_machine] }}" + tags: + - prefetch-docker-images + + - name: Clean Images - Remove Dangling Docker Images + cron: + name: "Remove dangling docker images" + minute: "10" + hour: "5" + weekday: "7" + job: "/usr/bin/docker rmi $(/usr/bin/docker images --filter 'dangling=true' -q)" + tags: + - remove-docker-images-dangling + + # TODO: Disabled until all images will be in registry + #- name: Clean Images - Prune Docker Images + # cron: + # name: "Prune docker images" + # minute: "10" + # hour: "6" + # weekday: 7 + # job: "/usr/bin/docker image prune --all --force" + # tags: + # - prune-docker-images
\ No newline at end of file diff --git a/fdio.infra.ansible/roles/cleanup/tasks/kill_containers.yaml b/fdio.infra.ansible/roles/cleanup/tasks/kill_containers.yaml new file mode 100644 index 0000000000..25fd48e420 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/kill_containers.yaml @@ -0,0 +1,42 @@ +--- +# file: roles/cleanup/tasks/kill_containers.yaml + +- name: Kill Docker Containers + block: + - name: Kill Container - Get Running Docker Containers + shell: "docker ps -aq" + register: running_containers + changed_when: no + tags: + - kill-containers + + - name: Kill Container - Remove All Docker Containers + shell: "docker rm --force {{ item }}" + with_items: "{{ running_containers.stdout_lines }}" + tags: + - kill-containers + + rescue: + - name: Restart Docker Daemon + systemd: + name: "docker" + state: "restarted" + +- name: Kill LXC Containers + block: + - name: Kill Container - Get Running LXC Containers + shell: "lxc-ls" + register: running_containers + changed_when: no + tags: + - kill-containers + + - name: Kill Container - Remove All LXC Containers + shell: "lxc-destroy --force -n {{ item }}" + with_items: "{{ running_containers.stdout_lines }}" + tags: + - kill-containers + + rescue: + - fail: + msg: "Kill LXC containers failed!"
\ No newline at end of file diff --git a/fdio.infra.ansible/roles/cleanup/tasks/kill_process.yaml b/fdio.infra.ansible/roles/cleanup/tasks/kill_process.yaml new file mode 100644 index 0000000000..c7cee37485 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/kill_process.yaml @@ -0,0 +1,37 @@ +--- +# file: roles/cleanup/tasks/kill_process.yaml + +- name: Kill Process - {{ process }} + block: + - name: Get PID Of {{ process }} + shell: "ps -ef | grep -v grep | grep -w {{ process }} | awk '{print $2}'" + when: + - process is defined and process != "" + register: running_processes + tags: + - kill-process + + - name: Safe Kill {{ process }} + shell: "kill {{ item }}" + with_items: "{{ running_processes.stdout_lines }}" + tags: + - kill-process + + - wait_for: + path: "/proc/{{ item }}/status" + state: "absent" + with_items: "{{ running_processes.stdout_lines }}" + ignore_errors: yes + register: killed_processes + tags: + - kill-process + + - name: Kill Process - Force Kill {{ process }} + shell: "kill -9 {{ item }}" + with_items: "{{ killed_processes.results | select('failed') | map(attribute='item') | list }}" + tags: + - kill-process + + rescue: + - fail: + msg: "Kill process {{ process }} failed!" diff --git a/fdio.infra.ansible/roles/cleanup/tasks/main.yaml b/fdio.infra.ansible/roles/cleanup/tasks/main.yaml new file mode 100644 index 0000000000..eeda0139b3 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/main.yaml @@ -0,0 +1,43 @@ +--- +# file: roles/cleanup/tasks/main.yaml +# purpose: Structured per server cleanup tasks. +# - main: +# - tg: +# - Run tasks on TG servers only. +# - Cleanup processes (T-Rex). +# - sut: +# - Run tasks on SUT servers only. +# - Cleanup file leftovers (logs). +# - Cleanup packages (VPP, Honeycomb). +# - Cleanup processes (qemu, l3fwd, testpmd, docker, kubernetes) +# - Cleanup interfaces. +# - vpp_device +# - Run tasks on vpp_device servers only. +# - Reset SRIOV +# - Docker image cleanup +# - nomad +# - Docker image cleanup + +- name: tg specific + include_tasks: tg.yaml + when: "'tg' in group_names" + tags: + - cleanup + +- name: sut specific + include_tasks: sut.yaml + when: "'sut' in group_names" + tags: + - cleanup + +- name: vpp_device specific + include_tasks: vpp_device.yaml + when: "'vpp_device' in group_names" + tags: + - cleanup + +- name: nomad specific + include_tasks: nomad.yaml + when: "'nomad' in group_names" + tags: + - cleanup diff --git a/fdio.infra.ansible/roles/cleanup/tasks/nomad.yaml b/fdio.infra.ansible/roles/cleanup/tasks/nomad.yaml new file mode 100644 index 0000000000..3c5bf6462d --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/nomad.yaml @@ -0,0 +1,22 @@ +--- +# file: roles/cleanup/tasks/nomad.yaml + +- name: Host Cleanup + block: + - name: Clean Images + import_tasks: clean_images.yaml + vars: + images_to_prefetch_by_arch: + aarch64: + - "fdiotools/builder-ubuntu2004:prod-aarch64" + - "fdiotools/builder-ubuntu1804:prod-aarch64" + - "fdiotools/builder-centos8:prod-aarch64" + x86_64: + - "fdiotools/builder-ubuntu2004:prod-x86_64" + - "fdiotools/builder-ubuntu1804:prod-x86_64" + - "fdiotools/builder-debian10:prod-x86_64" + - "fdiotools/builder-debian9:prod-x86_64" + - "fdiotools/builder-centos8:prod-x86_64" + - "fdiotools/builder-centos7:prod-x86_64" + tags: + - clean-images
\ No newline at end of file diff --git a/fdio.infra.ansible/roles/cleanup/tasks/remove_package.yaml b/fdio.infra.ansible/roles/cleanup/tasks/remove_package.yaml new file mode 100644 index 0000000000..302b43c99a --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/remove_package.yaml @@ -0,0 +1,21 @@ +--- +# file: roles/cleanup/tasks/remove_package.yaml + +- name: Remove Package - Fix Corrupted APT + shell: "dpkg --configure -a" + when: + - ansible_distribution == 'Ubuntu' + tags: + - remove-package + +- name: Remove Package - {{ package }} + apt: + name: "{{ package }}" + force: yes + purge: yes + state: "absent" + failed_when: no + when: + - ansible_distribution == 'Ubuntu' + tags: + - remove-package diff --git a/fdio.infra.ansible/roles/cleanup/tasks/sut.yaml b/fdio.infra.ansible/roles/cleanup/tasks/sut.yaml new file mode 100644 index 0000000000..d80a35b1cb --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/sut.yaml @@ -0,0 +1,83 @@ +--- +# file: roles/cleanup/tasks/sut.yaml + +- name: Host Cleanup + block: + - name: Kill Processes - Qemu + import_tasks: kill_process.yaml + vars: + process: "qemu" + tags: + - kill-process + + - name: Kill Processes - L3fwd + import_tasks: kill_process.yaml + vars: + process: "l3fwd" + tags: + - kill-process + + - name: Kill Processes - Testpmd + import_tasks: kill_process.yaml + vars: + process: "testpmd" + tags: + - kill-process + + - name: Kill Processes - iPerf3 + import_tasks: kill_process.yaml + vars: + process: "iperf3" + tags: + - kill-process + + - name: Kill Processes - vpp_echo + import_tasks: kill_process.yaml + vars: + process: "vpp_echo" + tags: + - kill-process + + - name: Find File Or Dir - Core Zip File + find: + paths: "/tmp/" + patterns: "*tar.lzo.lrz.xz*" + register: files_to_delete + tags: + - remove-file-dir + + - name: Remove File Or Dir - Core Zip File + file: + path: "{{ item.path }}" + state: absent + with_items: "{{ files_to_delete.files }}" + tags: + - remove-file-dir + + - name: Find File Or Dir - Core Dump File + find: + paths: "/tmp/" + patterns: "*core*" + register: files_to_delete + tags: + - remove-file-dir + + - name: Remove File Or Dir - Core Dump File + file: + path: "{{ item.path }}" + state: absent + with_items: "{{ files_to_delete.files }}" + tags: + - remove-file-dir + + - name: Kill Containers - Remove All Containers + import_tasks: kill_containers.yaml + tags: + - kill-containers + + - name: Remove Packages - Remove VPP + import_tasks: remove_package.yaml + vars: + package: "*vpp*" + tags: + - remove-package diff --git a/fdio.infra.ansible/roles/cleanup/tasks/tg.yaml b/fdio.infra.ansible/roles/cleanup/tasks/tg.yaml new file mode 100644 index 0000000000..fa2d2d2819 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/tg.yaml @@ -0,0 +1,13 @@ +--- +# file: roles/cleanup/tasks/tg.yaml + +- name: Host Cleanup + block: + - name: Kill Processes - TRex + import_tasks: kill_process.yaml + vars: + process: "_t-rex" + when: + - docker_tg is undefined + tags: + - kill-process diff --git a/fdio.infra.ansible/roles/cleanup/tasks/vpp_device.yaml b/fdio.infra.ansible/roles/cleanup/tasks/vpp_device.yaml new file mode 100644 index 0000000000..41c4b29d37 --- /dev/null +++ b/fdio.infra.ansible/roles/cleanup/tasks/vpp_device.yaml @@ -0,0 +1,32 @@ +--- +# file: roles/cleanup/tasks/vpp_device.yaml + +- name: Host Cleanup + block: + - name: Reset vpp_device Binary + copy: + src: "files/reset_vppdevice.sh" + dest: "/usr/local/bin" + owner: "root" + group: "root" + mode: "744" + tags: + - reset-sriov + + - name: Clean Images + import_tasks: clean_images.yaml + vars: + images_to_prefetch_by_arch: + aarch64: + - "fdiotools/builder-ubuntu2004:prod-aarch64" + - "fdiotools/builder-ubuntu1804:prod-aarch64" + - "fdiotools/builder-centos8:prod-aarch64" + x86_64: + - "fdiotools/builder-ubuntu2004:prod-x86_64" + - "fdiotools/builder-ubuntu1804:prod-x86_64" + - "fdiotools/builder-debian10:prod-x86_64" + - "fdiotools/builder-debian9:prod-x86_64" + - "fdiotools/builder-centos8:prod-x86_64" + - "fdiotools/builder-centos7:prod-x86_64" + tags: + - clean-images
\ No newline at end of file |