From 0169ded6717ac0c7f5ab188d2e8c2d7afc2eec7d Mon Sep 17 00:00:00 2001
From: Zack Galbreath
Date: Fri, 10 Jan 2025 14:16:16 -0500
Subject: [PATCH] New cron job to automatically retry failed trigger jobs

Mitigates #1031
---
 .github/workflows/custom_docker_builds.yml     |   2 +
 images/retry-trigger-jobs/Dockerfile           |  17 ++
 images/retry-trigger-jobs/README.md            |  11 ++
 images/retry-trigger-jobs/requirements.txt     |   2 +
 .../retry-trigger-jobs/retry_trigger_jobs.py   | 146 ++++++++++++++++++
 .../custom/retry-trigger-jobs/cron-jobs.yaml   |  32 ++++
 6 files changed, 210 insertions(+)
 create mode 100644 images/retry-trigger-jobs/Dockerfile
 create mode 100644 images/retry-trigger-jobs/README.md
 create mode 100644 images/retry-trigger-jobs/requirements.txt
 create mode 100644 images/retry-trigger-jobs/retry_trigger_jobs.py
 create mode 100644 k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml

diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml
index fc252bd31..844143478 100644
--- a/.github/workflows/custom_docker_builds.yml
+++ b/.github/workflows/custom_docker_builds.yml
@@ -47,6 +47,8 @@ jobs:
             image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4
           - docker-image: ./images/protected-publish
             image-tags: ghcr.io/spack/protected-publish:0.0.2
+          - docker-image: ./images/retry-trigger-jobs
+            image-tags: ghcr.io/spack/retry-trigger-jobs:0.0.1
     steps:
       - name: Checkout
         uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3
diff --git a/images/retry-trigger-jobs/Dockerfile b/images/retry-trigger-jobs/Dockerfile
new file mode 100644
index 000000000..439979190
--- /dev/null
+++ b/images/retry-trigger-jobs/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11
+
+RUN apt-get update && apt-get install -y \
+    gpg \
+    gpg-agent && \
+    apt-get autoremove --purge -y && \
+    apt-get clean
+
+COPY requirements.txt /srcs/requirements.txt
+
+RUN pip install --upgrade pip && \
+    pip install --no-cache-dir -r /srcs/requirements.txt
+
+COPY retry_trigger_jobs.py /srcs/retry_trigger_jobs.py
+
+WORKDIR /srcs
+ENTRYPOINT ["python", "retry_trigger_jobs.py"]
diff --git a/images/retry-trigger-jobs/README.md b/images/retry-trigger-jobs/README.md
new file mode 100644
index 000000000..00238586c
--- /dev/null
+++ b/images/retry-trigger-jobs/README.md
@@ -0,0 +1,11 @@
+# Purpose
+
+This script retries child pipelines whose generate job initially failed but succeeded upon retry.
+
+## Background
+
+This [issue](https://github.com/spack/spack-infrastructure/issues/1031) describes the problem in more detail.
+
+## Mitigation
+
+Periodically search for recent child pipelines that match this failure condition and use the GitLab API to retry their trigger jobs.
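For reference, the retry itself boils down to one GitLab API call: `POST /projects/:id/jobs/:job_id/retry` against the trigger job ("bridge") of the broken child pipeline. Below is a minimal sketch of that call, assuming a token in `GITLAB_TOKEN` and an already-identified bridge job ID (the ID shown is a placeholder); the detection logic that finds such jobs automatically lives in `retry_trigger_jobs.py` later in this patch.

```python
import os

import requests

# Same project the cron job targets (gitlab.spack.io project 2).
GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"
BRIDGE_JOB_ID = 123456  # placeholder: a trigger job ("bridge") that needs a retry

# Retrying the bridge re-runs the downstream (child) pipeline.
resp = requests.post(
    f"{GITLAB_API_URL}/jobs/{BRIDGE_JOB_ID}/retry",
    headers={"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # attributes of the retried trigger job
```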
diff --git a/images/retry-trigger-jobs/requirements.txt b/images/retry-trigger-jobs/requirements.txt
new file mode 100644
index 000000000..2276dc664
--- /dev/null
+++ b/images/retry-trigger-jobs/requirements.txt
@@ -0,0 +1,2 @@
+requests==2.31.0
+sentry-sdk==1.32.0
diff --git a/images/retry-trigger-jobs/retry_trigger_jobs.py b/images/retry-trigger-jobs/retry_trigger_jobs.py
new file mode 100644
index 000000000..f188cd4da
--- /dev/null
+++ b/images/retry-trigger-jobs/retry_trigger_jobs.py
@@ -0,0 +1,146 @@
+import argparse
+import json
+import os
+import urllib.parse
+from datetime import datetime, timedelta, timezone
+from requests import Session
+from requests.adapters import HTTPAdapter, Retry
+
+import sentry_sdk
+
+
+sentry_sdk.init(traces_sample_rate=0.01)
+
+GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"
+
+
+def paginate(session, query_url):
+    """Helper method to get all pages of paginated query results"""
+    results = []
+
+    while query_url:
+        resp = session.get(query_url)
+
+        resp.raise_for_status()
+
+        next_batch = json.loads(resp.content)
+
+        for result in next_batch:
+            results.append(result)
+
+        if "next" in resp.links:
+            query_url = resp.links["next"]["url"]
+        else:
+            query_url = None
+
+    return results
+
+
+def print_response(resp, padding=''):
+    """Helper method to print response status code and content"""
+    print(f"{padding}response code: {resp.status_code}")
+    print(f"{padding}response value: {resp.text}")
+
+
+def retry_trigger_jobs(last_n_hours):
+    """Analyze pipelines updated over the last_n_hours to find and retry
+    child pipelines whose generate jobs failed initially but succeeded
+    upon retry."""
+
+    # Set up a Requests session with backoff, retries, and credentials.
+    session = Session()
+    session.mount(
+        "https://",
+        HTTPAdapter(
+            max_retries=Retry(
+                total=5,
+                backoff_factor=2,
+                backoff_jitter=1,
+            ),
+        ),
+    )
+    session.headers.update({
+        'PRIVATE-TOKEN': os.environ.get("GITLAB_TOKEN", None)
+    })
+
+    # Iterate over recent pipelines.
+    dt = datetime.now(timezone.utc) - timedelta(hours=last_n_hours)
+    time_threshold = urllib.parse.quote_plus(dt.isoformat(timespec="seconds"))
+    pipelines_url = f"{GITLAB_API_URL}/pipelines?updated_after={time_threshold}"
+    pipelines = paginate(session, pipelines_url)
+    for pipeline in pipelines:
+        print(f"Checking pipeline {pipeline['id']}: {pipeline['ref']}")
+        # Iterate over the trigger jobs ("bridges") for this parent pipeline.
+        parent_id = pipeline['id']
+        bridges_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/bridges"
+        bridges = paginate(session, bridges_url)
+        for bridge in bridges:
+            if not bridge["downstream_pipeline"]:
+                continue
+            child_pipeline = bridge["downstream_pipeline"]
+            child_id = child_pipeline["id"]
+            # Carefully try to detect the particular failure case we're interested in here.
+            #
+            # 1) The child pipeline failed.
+            if child_pipeline["status"] != "failed":
+                continue
+
+            # 2) The trigger job reports an "unknown_failure".
+            if bridge["failure_reason"] != "unknown_failure":
+                continue
+
+            # 3) The child pipeline does not have any jobs.
+            child_jobs_url = f"{GITLAB_API_URL}/pipelines/{child_id}/jobs"
+            child_jobs = paginate(session, child_jobs_url)
+            if len(child_jobs) != 0:
+                continue
+
+            # 4) The generate job failed but succeeded upon retry.
+            # GitLab API unfortunately doesn't provide a clean way to
+            # find the relevant generate job for a given trigger job,
+            # so we get all the jobs for the parent pipeline and look
+            # for those with a particular name.
+            generate_job_name = bridge["name"].replace("-build", "-generate")
+            found_success = False
+            found_failed = False
+            parent_jobs_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/jobs?include_retried=true"
+            parent_jobs = paginate(session, parent_jobs_url)
+            for job in parent_jobs:
+                if job["name"] == generate_job_name:
+                    if job["status"] == "success":
+                        found_success = True
+                    elif job["status"] == "failed":
+                        found_failed = True
+                if found_success and found_failed:
+                    # If we found at least one success and one failed
+                    # generate job, retry the trigger job to fix the
+                    # child pipeline.
+                    print(f"!!! Retrying job #{bridge['id']} to fix pipeline {child_id}")
+                    retry_url = f"{GITLAB_API_URL}/jobs/{bridge['id']}/retry"
+                    print_response(session.post(retry_url))
+                    break
+
+
+def main():
+    """Script entrypoint"""
+    if "GITLAB_TOKEN" not in os.environ:
+        raise Exception("GITLAB_TOKEN environment variable is not set")
+
+    parser = argparse.ArgumentParser(
+        prog="retry_trigger_jobs.py",
+        description="Retry child pipelines that failed to generate initially",
+    )
+    parser.add_argument(
+        "hours",
+        type=int,
+        help="Number of hours to look back for failed child pipelines"
+    )
+    args = parser.parse_args()
+
+    retry_trigger_jobs(args.hours)
+
+
+################################################################################
+#
+if __name__ == "__main__":
+    main()
diff --git a/k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml b/k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml
new file mode 100644
index 000000000..a39a6a539
--- /dev/null
+++ b/k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml
@@ -0,0 +1,32 @@
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: retry-trigger-jobs
+  namespace: custom
+spec:
+  schedule: "*/15 * * * *"  # Every 15 minutes
+  concurrencyPolicy: Forbid
+  jobTemplate:
+    spec:
+      activeDeadlineSeconds: 3600  # terminate any running job after 1 hour
+      backoffLimit: 0
+      template:
+        spec:
+          serviceAccountName: retry-trigger-jobs
+          restartPolicy: Never
+          containers:
+            - name: retry-trigger-jobs
+              image: ghcr.io/spack/retry-trigger-jobs:0.0.1
+              imagePullPolicy: IfNotPresent
+              resources:
+                requests:
+                  cpu: 500m
+                  memory: 50M
+              args:
+                - "2"
+              envFrom:
+                - configMapRef:
+                    name: python-scripts-sentry-config
+          nodeSelector:
+            spack.io/node-pool: base
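To spot-check the detection conditions against a particular parent pipeline before relying on the cron job, one can dump the fields the script keys on. This is a rough sketch, not part of the patch: `PIPELINE_ID` is a placeholder, and pagination is ignored since a single pipeline rarely has more than one page of bridges.

```python
import os

import requests

GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"
PIPELINE_ID = 654321  # placeholder: a parent pipeline to inspect

resp = requests.get(
    f"{GITLAB_API_URL}/pipelines/{PIPELINE_ID}/bridges",
    headers={"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]},
    timeout=30,
)
resp.raise_for_status()

for bridge in resp.json():
    child = bridge.get("downstream_pipeline") or {}
    # These fields drive the script's checks 1) and 2); check 3) would
    # additionally require listing the child pipeline's jobs.
    print(bridge["name"], bridge.get("failure_reason"), child.get("id"), child.get("status"))
```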