diff --git a/roles/update/README.md b/roles/update/README.md
index 73fac13004..e799e3a602 100644
--- a/roles/update/README.md
+++ b/roles/update/README.md
@@ -13,5 +13,15 @@ Role to run update
 * `cifmw_update_ping_loss_percent` : (Integer) Maximum percentage of ping loss accepted. Default to `0`. Only relevant when `cifmw_update_ping_loss_second` is not 0.
 * `cifmw_update_control_plane_check`: (Boolean) Activate a continuous control plane testing. Default to `False`
 * `cifmw_update_openstackclient_pod_timeout`: (Integer) Maximum number of seconds to wait for the openstackclient Pod to be available during control plane testing, as it is being restarted during update. Default to `10` seconds.
-
+* `cifmw_update_reboot_test`: (Boolean) Activate the reboot test after update. Default to `True`.
+* `cifmw_update_ansible_ssh_private_key_file`: (String) Path to the SSH private key used to reach the compute nodes. Default to `ansible_ssh_private_key_file` when defined, `~/.ssh/id_cifw` otherwise.
+* `cifmw_update_wait_retries_reboot`: (Integer) Number of retries to wait for a compute node reboot. One retry is done every five seconds. Default to `60`, i.e. five minutes.
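+
+For instance, to keep the reboot test enabled but wait up to ten
+minutes for each compute node to come back (illustrative values):
+
+```yaml
+cifmw_update_reboot_test: true
+cifmw_update_wait_retries_reboot: 120
+```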
 ## Examples
diff --git a/roles/update/defaults/main.yml b/roles/update/defaults/main.yml
index 527c6361f2..3633c5d0d7 100644
--- a/roles/update/defaults/main.yml
+++ b/roles/update/defaults/main.yml
@@ -37,7 +37,15 @@ cifmw_update_timestamper_cmd: >-
 cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_start_ping.sh"
 cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"
 
+# Operation in the openstack namespace
+cifmw_update_openstack_cmd: >-
+  oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
+
 ## User facing
+cifmw_update_reboot_test: true
+cifmw_update_ansible_ssh_private_key_file: >-
+  {{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}
+cifmw_update_wait_retries_reboot: 60
 cifmw_update_ping_test: false
 cifmw_update_create_volume: false
diff --git a/roles/update/molecule/default/prepare.yml b/roles/update/molecule/default/prepare.yml
index 9360e433f5..7899e26c1f 100644
--- a/roles/update/molecule/default/prepare.yml
+++ b/roles/update/molecule/default/prepare.yml
@@ -28,7 +28,8 @@
     - role: ci_setup
     - role: install_yamls
   tasks:
-    - name: Set custom cifmw PATH reusable fact
+    - name: Set some custom reusable facts
       ansible.builtin.set_fact:
         cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
+        cifmw_update_reboot_test: false
         cacheable: true
diff --git a/roles/update/tasks/main.yml b/roles/update/tasks/main.yml
index 6fbd74e4a3..b3f0f4120c 100644
--- a/roles/update/tasks/main.yml
+++ b/roles/update/tasks/main.yml
@@ -74,3 +74,8 @@
       - not cifmw_update_run_dryrun | bool
   ansible.builtin.shell: |
     {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
+
+- name: Reboot the compute nodes
+  ansible.builtin.include_tasks: reboot_computes.yml
+  when:
+    - cifmw_update_reboot_test | bool
diff --git a/roles/update/tasks/reboot_computes.yml b/roles/update/tasks/reboot_computes.yml
new file mode 100644
index 0000000000..f0dc1d8601
--- /dev/null
+++ b/roles/update/tasks/reboot_computes.yml
@@ -0,0 +1,76 @@
+---
+- name: Register storage backend type
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} volume service list -f json |
+    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
+  register: storage_backend
+  changed_when: false
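+
+# The storage backend drives the live-migration mode used in
+# reboot_hypervisor.yml: with a Ceph (shared storage) backend VMs are
+# live-migrated as-is, otherwise --block-migrate is added.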
+
+- name: Get the list of OpenStack hypervisors
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: |
+    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
+  register: hypervisor_list
+  changed_when: false
+
+- name: Parse the hypervisor list to extract hostnames
+  ansible.builtin.set_fact:
+    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"
+
+- name: Create a server reboot monitor script
+  ansible.builtin.template:
+    src: "monitor_servers.sh.j2"
+    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
+    mode: "0775"
+
+- name: Start the monitor servers script
+  ansible.builtin.shell: |
+    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh &> /dev/null &
+    echo $!
+  register: monitor_servers_job
+
+- name: Create a VM placement monitor script
+  ansible.builtin.template:
+    src: "monitor_vm_placement.sh.j2"
+    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
+    mode: "0775"
+
+- name: Start the monitor placement script
+  ansible.builtin.shell: |
+    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh &> /dev/null &
+    echo $!
+  register: monitor_placement_job
+
+- name: Iterate over each hypervisor for the reboot sequence
+  ansible.builtin.include_tasks: reboot_hypervisor.yml
+  loop: "{{ hypervisor_hostnames }}"
+  loop_control:
+    loop_var: hypervisor
+
+- name: Stop the monitor servers script if running
+  ansible.builtin.shell: |
+    if kill -0 {{ monitor_servers_job.stdout }} &>/dev/null; then
+      kill {{ monitor_servers_job.stdout }}
+    fi
+  register: kill_result
+  failed_when: kill_result.rc not in [0, 1]  # We can still have a race
+                                             # between kill -0 and kill,
+                                             # even if unlikely.
+
+- name: Stop the monitor placement script if running
+  ansible.builtin.shell: |
+    if kill -0 {{ monitor_placement_job.stdout }} &>/dev/null; then
+      kill {{ monitor_placement_job.stdout }}
+    fi
+  register: kill_result
+  failed_when: kill_result.rc not in [0, 1]
diff --git a/roles/update/tasks/reboot_hypervisor.yml b/roles/update/tasks/reboot_hypervisor.yml
new file mode 100644
index 0000000000..c3f02f48f1
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor.yml
@@ -0,0 +1,92 @@
+---
+- name: Extract short hostname from FQDN
+  ansible.builtin.set_fact:
+    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"
+
+- name: Display current stage
+  ansible.builtin.debug:
+    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"
+
+- name: Define command for nova interaction
+  ansible.builtin.set_fact:
+    cifmw_update_bash_cmd: >-
+      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c
+
+- name: Check active VMs on hypervisor
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
+  register: active_vms
+  changed_when: false
+
+- name: Evacuate VMs if they are running
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    {{ cifmw_update_bash_cmd }} ". cloudrc &&
+    nova host-evacuate-live
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ hypervisor }}"
+  when: active_vms.stdout != ''
+  changed_when: true
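+
+# nova host-evacuate-live only schedules the live migrations, so poll
+# until no ACTIVE, PAUSED or MIGRATING instance remains on the node.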
+- name: Wait for compute node to get quiesced
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
+    | jq -r -c '[.[] | select(.Status |
+    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
+    | length'
+  register: compute_node_instances
+  until: compute_node_instances.stdout.find("0") > -1
+  retries: 30
+  delay: 5
+  when:
+    - active_vms.stdout != ''
+
+- name: Reboot the hypervisor using a CR
+  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml
+
+- name: Perform sanity checks post-reboot
+  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
+  vars:
+    current_hypervisor: "{{ hypervisor }}"
+
+- name: Display current stage
+  ansible.builtin.debug:
+    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
+  with_items: "{{ active_vms.stdout_lines }}"
+
+- name: Migrate back VMs post-reboot
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_bash_cmd }} ". cloudrc &&
+    nova live-migration
+    {% if 'ceph' not in storage_backend.stdout %}
+    --block-migrate
+    {% endif %}
+    {{ item }} {{ hypervisor }}";
+    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
+    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
+  register: instance_migration_result
+  until: instance_migration_result.stdout.find(hypervisor) > -1
+  retries: 30
+  delay: 5
+  with_items: "{{ active_vms.stdout_lines }}"
+  when:
+    - active_vms.stdout != ''
diff --git a/roles/update/tasks/reboot_hypervisor_sanity_checks.yml b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
new file mode 100644
index 0000000000..d87a865f22
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_sanity_checks.yml
@@ -0,0 +1,50 @@
+---
+- name: Display current stage
+  ansible.builtin.debug:
+    msg: |
+      Testing the status of the services for {{ current_hypervisor }} after reboot.
+
+- name: Verify nova-compute services
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} compute service list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("nova-compute")) | .State'
+  register: nova_compute_status
+  until: nova_compute_status.stdout == 'up'
+  retries: 30
+  delay: 5
+
+- name: Verify ovn-controller services
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} network agent list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("ovn-controller")) | .Alive'
+  register: ovn_controller_status
+  until: ovn_controller_status.stdout == 'true'
+  retries: 30
+  delay: 5
+
+- name: Verify neutron-ovn-metadata-agent
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    {{ cifmw_update_openstack_cmd }} network agent list
+    --host {{ current_hypervisor }} -f json
+    | jq -r -c '.[]
+    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
+  register: networking_ovn_metadata_status
+  until: networking_ovn_metadata_status.stdout == 'true'
+  retries: 30
+  delay: 5
diff --git a/roles/update/tasks/reboot_hypervisor_using_cr.yml b/roles/update/tasks/reboot_hypervisor_using_cr.yml
new file mode 100644
index 0000000000..7ad7e26f26
--- /dev/null
+++ b/roles/update/tasks/reboot_hypervisor_using_cr.yml
@@ -0,0 +1,64 @@
+---
+- name: Fetch NodeSets for the Reboot OpenStackDataPlaneDeployment
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.shell: >-
+    set -o pipefail;
+    oc -n {{ cifmw_update_namespace }}
+    get openstackdataplanenodeset -o name
+    | awk -F'/' '{print $2}'
+  register: cifmw_update_node_sets
+  changed_when: false
+
+- name: Construct the Reboot CR name
+  ansible.builtin.set_fact:
+    cifmw_reboot_dep_name: >-
+      {{
+        'reboot-' ~ cifmw_update_hypervisor_short_name ~ '-' ~
+        lookup('pipe', 'date +%Y%m%d%H%M%S')
+      }}
+
+- name: Render the OpenStackDataPlaneDeployment CR used for reboot
+  ansible.builtin.copy:
+    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
+    content: "{{ _content | to_nice_yaml }}"
+  vars:
+    _content:
+      apiVersion: dataplane.openstack.org/v1beta1
+      kind: OpenStackDataPlaneDeployment
+      metadata:
+        name: "{{ cifmw_reboot_dep_name }}"
+        namespace: "{{ cifmw_update_namespace }}"
+      spec:
+        nodeSets: "{{ cifmw_update_node_sets.stdout
+                   | split('\n')
+                   | map('trim')
+                   | reject('equalto', '')
+                   | list
+                   }}"
+        servicesOverride:
+          - reboot-os
+        ansibleExtraVars:
+          edpm_reboot_strategy: force
+        ansibleLimit: "{{ cifmw_update_hypervisor_short_name }}"
+
+- name: Create the OpenStackDataPlaneDeployment CR to trigger the reboot
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.command: >-
+    oc -n {{ cifmw_update_namespace }}
+    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml
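+
+# One retry is done every five seconds, hence an overall timeout of
+# cifmw_update_wait_retries_reboot * 5 seconds.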
+- name: Wait for the reboot to finish
+  environment:
+    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
+    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
+  ansible.builtin.command: >-
+    oc -n {{ cifmw_update_namespace }}
+    wait --for=condition=SetupComplete
+    openstackdataplanedeployment/{{ cifmw_reboot_dep_name }}
+    --timeout={{ (cifmw_update_wait_retries_reboot | int * 5) }}s
diff --git a/roles/update/templates/monitor_servers.sh.j2 b/roles/update/templates/monitor_servers.sh.j2
new file mode 100644
index 0000000000..41e6e1d37f
--- /dev/null
+++ b/roles/update/templates/monitor_servers.sh.j2
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+servers=(
+{% for server in hypervisor_hostnames %}
+{{ server.split('.')[0] }}
+{% endfor %}
+)
+
+log_file="{{ cifmw_update_artifacts_basedir }}/monitor_servers.log"
+pid_file="{{ cifmw_update_artifacts_basedir }}/monitor_servers.pid"
+
+# Write the script's PID to the file
+echo $$ > "$pid_file"
+
+# Function to check server status via SSH.
+# TODO: ping always replies even if the server is down, so we use SSH instead.
+check_servers() {
+    for server in "${servers[@]}"; do
+        if ssh -i {{ cifmw_update_ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
+            # Server is up
+            if [ "${server_status[$server]}" == "down" ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
+                server_status[$server]="up"
+            fi
+        else
+            # Server is down
+            if [ "${server_status[$server]}" != "down" ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
+                server_status[$server]="down"
+            fi
+        fi
+    done
+}
+
+# Function to handle script termination
+cleanup() {
+    TERMINATE=true
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - Termination signal received, waiting for check_servers to complete..." | tee -a "$log_file"
+}
+
+# Trap signals and call cleanup function
+trap cleanup SIGINT SIGTERM
+
+# Initialize server status array
+declare -A server_status
+for server in "${servers[@]}"; do
+    server_status[$server]="unknown"
+done
+
+# Main loop to continuously check server status
+while true; do
+    check_servers
+    # Handle signal
+    if [ "$TERMINATE" = true ]; then
+        echo "$(date '+%Y-%m-%d %H:%M:%S') - Script terminated" | tee -a "$log_file"
+        rm -f "$pid_file"
+        exit 0
+    fi
+    sleep 1
+done
diff --git a/roles/update/templates/monitor_vm_placement.sh.j2 b/roles/update/templates/monitor_vm_placement.sh.j2
new file mode 100644
index 0000000000..17b3c02ba5
--- /dev/null
+++ b/roles/update/templates/monitor_vm_placement.sh.j2
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Log the instance's hypervisor. Useful when tracking compute reboots.
+set -e
+set -o pipefail
+
+export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
+export PATH="{{ cifmw_path }}"
+
+log_file="{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.log"
+pid_file="{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.pid"
+
+# Write the script's PID to the file
+echo $$ > "$pid_file"
+
+source_file="{{ cifmw_update_artifacts_basedir }}/workload_suffix"
+
+. "$source_file"
+
+instance_name="instance_${SUFFIX}"
+previous_hypervisor=""
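+
+# The termination trap only sets a flag: the main loop exits at the
+# start of the next iteration, so a polling cycle is never cut short.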
+# Function to handle script termination
+cleanup() {
+    TERMINATE=true
+    echo "$(date '+%Y-%m-%d %H:%M:%S') - Termination signal received, \
+waiting for monitor_vm_placement to complete..." | \
+        tee -a "$log_file"
+}
+
+# Trap signals and call cleanup function
+trap cleanup SIGINT SIGTERM
+
+while true; do
+    # Handle signal
+    if [ "$TERMINATE" = true ]; then
+        echo "$(date '+%Y-%m-%d %H:%M:%S') - Script terminated" | tee -a "$log_file"
+        rm -f "$pid_file"
+        exit 0
+    fi
+    # If the VM has a new hypervisor, log it.
+    current_hypervisor=$(
+        oc rsh -n {{ cifmw_update_namespace }} openstackclient \
+            openstack server show "${instance_name}" -f json | \
+            jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
+    )
+    if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
+        echo "$(date '+%Y-%m-%d %H:%M:%S') $instance_name $current_hypervisor" \
+            >> "$log_file"
+        previous_hypervisor="$current_hypervisor"
+    fi
+    sleep 1
+done