From 09bc6f0a9901286278b04b50a9063ffefc427839 Mon Sep 17 00:00:00 2001
From: Sofer Athlan-Guyot
Date: Thu, 5 Dec 2024 05:09:58 +0100
Subject: [PATCH] Add the steps to reboot the computes after update.

This sequence implements the reboot of the compute nodes after the
update.

If one or more instances have been created they will be live-migrated
to other hypervisors before the reboot and migrated back to the
original hypervisor after the reboot.

Some basic sanity checks are performed after the reboot and before the
migration back to ensure that the necessary services are up and
running.

Closes: https://issues.redhat.com/browse/OSPRH-8937
---
 roles/update/tasks/main.yml                   |  3 +
 roles/update/tasks/reboot_compute.yml         | 35 ++++++++
 roles/update/tasks/reboot_hypervisor.yml      | 95 +++++++++++++++++++
 .../tasks/reboot_hypervisor_sanity_checks.yml | 43 +++++++++
 4 files changed, 176 insertions(+)
 create mode 100644 roles/update/tasks/reboot_compute.yml
 create mode 100644 roles/update/tasks/reboot_hypervisor.yml
 create mode 100644 roles/update/tasks/reboot_hypervisor_sanity_checks.yml

diff --git a/roles/update/tasks/main.yml b/roles/update/tasks/main.yml
index 6fbd74e4a3..b13bfee9e8 100644
--- a/roles/update/tasks/main.yml
+++ b/roles/update/tasks/main.yml
@@ -74,3 +74,6 @@
     - not cifmw_update_run_dryrun | bool
   ansible.builtin.shell: |
     {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
+
+- name: Reboot the compute nodes
+  ansible.builtin.include_tasks: reboot_compute.yml
diff --git a/roles/update/tasks/reboot_compute.yml b/roles/update/tasks/reboot_compute.yml
new file mode 100644
index 0000000000..8a3d2f44ce
--- /dev/null
+++ b/roles/update/tasks/reboot_compute.yml
@@ -0,0 +1,35 @@
+---
+# Reboot every hypervisor reported by Nova, one at a time.
+# The per-hypervisor steps live in reboot_hypervisor.yml.
+
+- name: Define command for OpenStack client interactions
+  ansible.builtin.set_fact:
+    openstack_cmd: "oc rsh -n openstack openstackclient openstack"
+    bash_cmd: "oc rsh -n openstack openstackclient bash -c"
+
+# The cinder-volume host names are searched for "ceph" later on to
+# decide whether live migrations need --block-migrate.
+- name: Register storage backend type
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} volume service list -f json |
+    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
+  register: storage_backend
+  changed_when: false
+
+- name: Get list of OpenStack hypervisors
+  ansible.builtin.shell: |
+    {{ openstack_cmd }} hypervisor list -f json
+  register: hypervisor_list
+  changed_when: false
+
+- name: Parse the hypervisor list to extract hostnames
+  ansible.builtin.set_fact:
+    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"
+
+# One hypervisor per iteration: drain, reboot, sanity checks, then
+# migrate the instances back.
+- name: Iterate over each hypervisor
+  ansible.builtin.include_tasks: reboot_hypervisor.yml
+  loop: "{{ hypervisor_hostnames }}"
+  loop_control:
+    loop_var: hypervisor
diff --git a/roles/update/tasks/reboot_hypervisor.yml b/roles/update/tasks/reboot_hypervisor.yml
new file mode 100644
index 0000000000..920e1fcdd7
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor.yml
@@ -0,0 +1,95 @@
+---
+# Reboot one hypervisor: live-migrate its instances away, reboot it,
+# run the sanity checks and migrate the instances back.
+#
+# Variables read here and set by reboot_compute.yml:
+#   hypervisor       hypervisor hostname (loop variable)
+#   openstack_cmd    prefix to run the openstack client
+#   bash_cmd         prefix to run "bash -c" next to the client
+#   storage_backend  registered cinder-volume host list
+
+- name: Extract short hostname from FQDN
+  ansible.builtin.set_fact:
+    hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"
+
+- name: Announce the hypervisor being rebooted
+  ansible.builtin.debug:
+    msg: "Rebooting {{ hypervisor_short_name }}"
+
+# The IDs registered here drive both the evacuation and the
+# migration back after the reboot.
+- name: Check active VMs on hypervisor
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
+  register: active_vms
+  changed_when: false
+
+# --block-migrate is only passed when no cinder-volume host name
+# contains "ceph".
+- name: Evacuate VMs if they are running
+  ansible.builtin.shell: >-
+    {{ bash_cmd }} ". cloudrc &&
+    nova host-evacuate-live
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ hypervisor }}"
+  when: active_vms.stdout != ''
+  changed_when: true
+
+# The whole count is compared with "0": a substring search for "0"
+# would also accept 10, 20, ... remaining instances.
+- name: Wait for compute node to get quiesced
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '[.[] | select(.Status |
+    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
+    | length'
+  register: compute_node_instances
+  until: compute_node_instances.stdout | trim == '0'
+  retries: 30
+  delay: 5
+  changed_when: false
+  when:
+    - active_vms.stdout != ''
+
+# A "degraded" system state is accepted as a successful boot.
+- name: Reboot the hypervisor
+  ansible.builtin.reboot:
+    reboot_timeout: 1200
+    test_command: "systemctl is-system-running | grep -e running -e degraded"
+  delegate_to: "{{ hypervisor_short_name }}"
+  become: true
+
+- name: Perform sanity checks post-reboot
+  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
+  vars:
+    current_hypervisor: "{{ hypervisor }}"
+
+- name: Announce the instances being migrated back
+  ansible.builtin.debug:
+    msg: "Migrate back {{ item }} to {{ hypervisor_short_name }}."
+  loop: "{{ active_vms.stdout_lines }}"
+
+# Each attempt requests the live migration and then prints the host
+# currently running the instance; the task is retried until that
+# output mentions the rebooted hypervisor.
+- name: Migrate back VMs post-reboot
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ bash_cmd }} ". cloudrc &&
+    nova live-migration
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ item }} {{ hypervisor }}";
+    {{ openstack_cmd }} server show {{ item }} -f json |
+    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
+  register: instance_migration_result
+  until: instance_migration_result.stdout.find(hypervisor) > -1
+  retries: 30
+  delay: 5
+  loop: "{{ active_vms.stdout_lines }}"
+  when:
+    - active_vms.stdout != ''
diff --git a/roles/update/tasks/reboot_hypervisor_sanity_checks.yml b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
new file mode 100644
index 0000000000..cb86d8f7ce
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
@@ -0,0 +1,43 @@
+---
+# Sanity checks run after a hypervisor reboot. Each check queries the
+# control plane and is retried (retries: 5, delay: 30) until the
+# service reports the expected state.
+#
+# Expects current_hypervisor and openstack_cmd from the caller.
+
+- name: Announce the hypervisor being checked
+  ansible.builtin.debug:
+    msg: "Here I'm testing the reboot for {{ current_hypervisor }}."
+
+- name: Verify nova-compute service
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("nova-compute")) | .State'
+  register: nova_compute_status
+  until: nova_compute_status.stdout == 'up'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify ovn-controller service
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("ovn-controller")) | .Alive'
+  register: ovn_controller_status
+  until: ovn_controller_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false
+
+- name: Verify networking-ovn-metadata-agent
+  ansible.builtin.shell: >-
+    {{ openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
+  register: networking_ovn_metadata_status
+  until: networking_ovn_metadata_status.stdout == 'true'
+  retries: 5
+  delay: 30
+  changed_when: false