Skip to content

Commit

Permalink
New cron job to automatically retry failed trigger jobs
Browse files Browse the repository at this point in the history
Mitigates #1031
  • Loading branch information
zackgalbreath committed Jan 13, 2025
1 parent fbe19a4 commit 0169ded
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/custom_docker_builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ jobs:
image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4
- docker-image: ./images/protected-publish
image-tags: ghcr.io/spack/protected-publish:0.0.2
- docker-image: ./images/retry-trigger-jobs
image-tags: ghcr.io/spack/retry-trigger-jobs:0.0.1
steps:
- name: Checkout
uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3
Expand Down
17 changes: 17 additions & 0 deletions images/retry-trigger-jobs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Image for the retry-trigger-jobs cron job (see k8s/production/custom/retry-trigger-jobs).
FROM python:3.11

# NOTE(review): gpg/gpg-agent do not appear to be used by retry_trigger_jobs.py —
# confirm they are actually needed. Purge/clean keeps the layer small.
RUN apt-get update && apt-get install -y \
    gpg \
    gpg-agent && \
    apt-get autoremove --purge -y && \
    apt-get clean

# Copy requirements separately so dependency installation is cached
# independently of changes to the script itself.
COPY requirements.txt /srcs/requirements.txt

RUN pip install --upgrade pip && \
    pip install --no-cache-dir -r /srcs/requirements.txt

COPY retry_trigger_jobs.py /srcs/retry_trigger_jobs.py

WORKDIR /srcs
# Container arguments (e.g. the "hours" lookback) are appended by the CronJob spec.
ENTRYPOINT ["python", "retry_trigger_jobs.py"]
11 changes: 11 additions & 0 deletions images/retry-trigger-jobs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Purpose

The purpose of this script is to retry the trigger jobs of child pipelines that failed because their generate job initially failed, even though the generate job succeeded upon retry.

## Background

This [issue](https://github.com/spack/spack-infrastructure/issues/1031) describes the problem in more detail.

## Mitigation

Periodically search for recent child pipelines that match this failure condition and retry them using the GitLab API.
2 changes: 2 additions & 0 deletions images/retry-trigger-jobs/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Pinned runtime dependencies for retry_trigger_jobs.py.
requests==2.31.0
sentry-sdk==1.32.0
146 changes: 146 additions & 0 deletions images/retry-trigger-jobs/retry_trigger_jobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import argparse
import json
import os
import urllib.parse
from datetime import datetime, timedelta, timezone
from requests import Session
from requests.adapters import HTTPAdapter, Retry

import sentry_sdk


# Initialize Sentry error reporting; connection settings (DSN, etc.) are
# presumably taken from SENTRY_* environment variables — none are passed here.
sentry_sdk.init(traces_sample_rate=0.01)

# Base URL for all GitLab API calls: project id 2 on gitlab.spack.io.
GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"


def paginate(session, query_url):
    """Return all results of a paginated GitLab API query as one list.

    Follows the RFC 5988 ``Link`` response headers (parsed by requests
    into ``resp.links``) until no ``next`` page remains.

    Args:
        session: a requests.Session with auth headers already configured.
        query_url: URL of the first page of results.

    Returns:
        A list containing the decoded JSON items from every page.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    results = []

    while query_url:
        resp = session.get(query_url)
        resp.raise_for_status()

        # resp.json() decodes the body directly — no need for the manual
        # json.loads(resp.content) round-trip; extend() replaces the
        # element-by-element append loop.
        results.extend(resp.json())

        # "next" is absent from the Link header on the final page.
        query_url = resp.links.get("next", {}).get("url")

    return results


def print_response(resp, padding=''):
    """Print an HTTP response's status code and body text.

    Args:
        resp: a requests.Response (anything with .status_code and .text).
        padding: string prefixed to each printed line, for indentation.
    """
    status_line = f"{padding}response code: {resp.status_code}"
    body_line = f"{padding}response value: {resp.text}"
    print(status_line)
    print(body_line)


def retry_trigger_jobs(last_n_hours):
    """Analyze pipelines updated over the last_n_hours to find and retry
    child pipelines whose generate jobs failed initially but succeeded
    upon retry.

    Args:
        last_n_hours (int): how far back (in hours) to look for pipelines
            with recent updates.

    Side effects:
        Issues GitLab API GET requests; POSTs a retry for each matching
        trigger job; prints progress and responses to stdout.
    """

    # Set up a Requests session with backoff, retries, and credentials.
    session = Session()
    session.mount(
        "https://",
        HTTPAdapter(
            max_retries=Retry(
                total=5,
                backoff_factor=2,
                backoff_jitter=1,
            ),
        ),
    )
    # GITLAB_TOKEN presence is validated by main() before this runs; if it
    # were missing, a None header value would break requests.
    session.headers.update({
        'PRIVATE-TOKEN': os.environ.get("GITLAB_TOKEN", None)
    })

    # Iterate over recent pipelines.
    # quote_plus makes the ISO-8601 timestamp URL-safe (the "+00:00" UTC
    # offset would otherwise be decoded as a space by the server).
    dt = datetime.now(timezone.utc) - timedelta(hours=last_n_hours)
    time_threshold = urllib.parse.quote_plus(dt.isoformat(timespec="seconds"))
    pipelines_url = f"{GITLAB_API_URL}/pipelines?updated_after={time_threshold}"
    pipelines = paginate(session, pipelines_url)
    for pipeline in pipelines:
        print(f"Checking pipeline {pipeline['id']}: {pipeline['ref']}")
        # Iterate over the trigger jobs ("bridges") for this parent pipeline.
        parent_id = pipeline['id']
        bridges_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/bridges"
        bridges = paginate(session, bridges_url)
        for bridge in bridges:
            # A bridge that never created a downstream pipeline is not our case.
            if not bridge["downstream_pipeline"]:
                continue
            child_pipeline = bridge["downstream_pipeline"]
            child_id = child_pipeline["id"]
            # Carefully try to detect the particular failure case we're interested in here.
            #
            # 1) The child pipeline failed.
            if child_pipeline["status"] != "failed":
                continue

            # 2) The trigger job reports an "unknown_failure".
            if bridge["failure_reason"] != "unknown_failure":
                continue

            # 3) The child pipeline does not have any jobs.
            child_jobs_url = f"{GITLAB_API_URL}/pipelines/{child_id}/jobs"
            child_jobs = paginate(session, child_jobs_url)
            if len(child_jobs) != 0:
                continue

            # 4) The generate job failed but succeeded upon retry.
            #    GitLab API unfortunately doesn't provide a clean way to
            #    find the relevant generate job for a given trigger job,
            #    so we get all the jobs for the parent pipeline and look
            #    for those with a particular name (naming convention:
            #    "<stack>-build" trigger <-> "<stack>-generate" job —
            #    TODO confirm this holds for all stacks).
            generate_job_name = bridge["name"].replace("-build", "-generate")
            found_success = False
            found_failed = False
            # include_retried=true is required to see the original failed
            # attempt alongside the successful retry.
            parent_jobs_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/jobs?include_retried=true"
            parent_jobs = paginate(session, parent_jobs_url)
            for job in parent_jobs:
                if job["name"] == generate_job_name:
                    if job["status"] == "success":
                        found_success = True
                    elif job["status"] == "failed":
                        found_failed = True
            if found_success and found_failed:
                # If we found at least one success and one failed
                # generate job, retry the trigger job to fix the
                # child pipeline.
                print(f"!!! Retrying job #{bridge['id']} to fix pipeline {child_id}")
                retry_url = f"{GITLAB_API_URL}/jobs/{bridge['id']}/retry"
                print_response(session.post(retry_url))
                # NOTE(review): this break exits the bridges loop, so at
                # most one trigger job is retried per parent pipeline per
                # run — presumably later cron runs pick up any others.
                break


def main():
    """Script entrypoint: validate credentials, parse args, run the scan."""
    # Fail fast before any API work if the required credential is absent.
    if "GITLAB_TOKEN" not in os.environ:
        raise Exception("GITLAB_TOKEN environment is not set")

    arg_parser = argparse.ArgumentParser(
        prog="retry_trigger_jobs.py",
        description="Retry child pipelines that failed to generate initially",
    )
    arg_parser.add_argument(
        "hours",
        type=int,
        help="Number of hours to look back for failed child pipelines"
    )
    parsed = arg_parser.parse_args()

    retry_trigger_jobs(parsed.hours)


################################################################################
# Run only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
32 changes: 32 additions & 0 deletions k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: retry-trigger-jobs
  namespace: custom
spec:
  schedule: "*/15 * * * *" # Every 15 minutes
  concurrencyPolicy: Forbid # never start a run while a previous one is active
  jobTemplate:
    spec:
      activeDeadlineSeconds: 3600 # terminate any running job after 1 hour
      backoffLimit: 0 # do not retry a failed job; the next scheduled run covers it
      template:
        spec:
          serviceAccountName: retry-trigger-jobs
          restartPolicy: Never
          containers:
            - name: retry-trigger-jobs
              image: ghcr.io/spack/retry-trigger-jobs:0.0.1
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 500m
                  memory: 50M
              args:
                - "2" # "hours" positional arg: look back 2 hours per run
              envFrom:
                # Supplies the SENTRY_* environment variables read by sentry-sdk.
                - configMapRef:
                    name: python-scripts-sentry-config
          nodeSelector:
            spack.io/node-pool: base

0 comments on commit 0169ded

Please sign in to comment.