New cron job to automatically retry failed trigger jobs #1035

Merged Jan 15, 2025 (1 commit)
2 changes: 2 additions & 0 deletions .github/workflows/custom_docker_builds.yml
@@ -47,6 +47,8 @@ jobs:
          image-tags: ghcr.io/spack/ci-prune-buildcache:0.0.4
        - docker-image: ./images/protected-publish
          image-tags: ghcr.io/spack/protected-publish:0.0.2
        - docker-image: ./images/retry-trigger-jobs
          image-tags: ghcr.io/spack/retry-trigger-jobs:0.0.1
    steps:
      - name: Checkout
        uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3
17 changes: 17 additions & 0 deletions images/retry-trigger-jobs/Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.11

RUN apt-get update && apt-get install -y \
    gpg \
    gpg-agent && \
    apt-get autoremove --purge -y && \
    apt-get clean

COPY requirements.txt /srcs/requirements.txt

RUN pip install --upgrade pip && \
    pip install --no-cache-dir -r /srcs/requirements.txt

COPY retry_trigger_jobs.py /srcs/retry_trigger_jobs.py

WORKDIR /srcs
ENTRYPOINT ["python", "retry_trigger_jobs.py"]
11 changes: 11 additions & 0 deletions images/retry-trigger-jobs/README.md
@@ -0,0 +1,11 @@
# Purpose

This script retries child pipelines whose generate job failed initially but succeeded upon retry.

## Background

This [issue](https://github.com/spack/spack-infrastructure/issues/1031) describes the problem in more detail.

## Mitigation

Periodically search for recent child pipelines that match this failure condition and retry them using the GitLab API.
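
For orientation, the core of the mitigation is just two GitLab REST calls: list pipelines updated after a timestamp, then POST a retry for the affected trigger job. Below is a minimal sketch (not part of this PR), assuming a `GITLAB_TOKEN` environment variable with API access; the job id `1234` is a placeholder:

```python
import os
import requests

API = "https://gitlab.spack.io/api/v4/projects/2"
headers = {"PRIVATE-TOKEN": os.environ["GITLAB_TOKEN"]}

# List pipelines updated after a given timestamp (first page only here;
# the real script follows pagination links).
resp = requests.get(
    f"{API}/pipelines",
    params={"updated_after": "2025-01-15T00:00:00+00:00"},
    headers=headers,
)
resp.raise_for_status()
for pipeline in resp.json():
    print(pipeline["id"], pipeline["status"])

# Retry a trigger job by id (1234 is a placeholder).
resp = requests.post(f"{API}/jobs/1234/retry", headers=headers)
print(resp.status_code)
```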
2 changes: 2 additions & 0 deletions images/retry-trigger-jobs/requirements.txt
@@ -0,0 +1,2 @@
requests==2.31.0
sentry-sdk==1.32.0
146 changes: 146 additions & 0 deletions images/retry-trigger-jobs/retry_trigger_jobs.py
@@ -0,0 +1,146 @@
import argparse
import json
import os
import urllib.parse
from datetime import datetime, timedelta, timezone
from requests import Session
from requests.adapters import HTTPAdapter, Retry

import sentry_sdk


sentry_sdk.init(traces_sample_rate=0.01)

GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"


def paginate(session, query_url):
    """Helper method to get all pages of paginated query results"""
    results = []

    while query_url:
        resp = session.get(query_url)

        resp.raise_for_status()

        next_batch = json.loads(resp.content)

        for result in next_batch:
            results.append(result)

        # Requests parses RFC 5988 Link headers into resp.links; follow
        # the "next" relation until the last page is reached.
        if "next" in resp.links:
            query_url = resp.links["next"]["url"]
        else:
            query_url = None

    return results


def print_response(resp, padding=''):
    """Helper method to print response status code and content"""
    print(f"{padding}response code: {resp.status_code}")
    print(f"{padding}response value: {resp.text}")


def retry_trigger_jobs(last_n_hours):
    """Analyze pipelines updated over the last_n_hours to find and retry
    child pipelines whose generate jobs failed initially but succeeded
    upon retry."""

    # Set up a Requests session with backoff, retries, and credentials.
    session = Session()
    session.mount(
        "https://",
        HTTPAdapter(
            max_retries=Retry(
                total=5,
                backoff_factor=2,
                backoff_jitter=1,
            ),
        ),
    )
    session.headers.update({
        'PRIVATE-TOKEN': os.environ.get("GITLAB_TOKEN", None)
    })

    # Iterate over recent pipelines.
    dt = datetime.now(timezone.utc) - timedelta(hours=last_n_hours)
    time_threshold = urllib.parse.quote_plus(dt.isoformat(timespec="seconds"))
    pipelines_url = f"{GITLAB_API_URL}/pipelines?updated_after={time_threshold}"
    pipelines = paginate(session, pipelines_url)
    for pipeline in pipelines:
        print(f"Checking pipeline {pipeline['id']}: {pipeline['ref']}")
        # Iterate over the trigger jobs ("bridges") for this parent pipeline.
        parent_id = pipeline['id']
        bridges_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/bridges"
        bridges = paginate(session, bridges_url)
        for bridge in bridges:
            if not bridge["downstream_pipeline"]:
                continue
            child_pipeline = bridge["downstream_pipeline"]
            child_id = child_pipeline["id"]
            # Carefully try to detect the particular failure case we're interested in here.
            #
            # 1) The child pipeline failed.
            if child_pipeline["status"] != "failed":
                continue

            # 2) The trigger job reports an "unknown_failure".
            if bridge["failure_reason"] != "unknown_failure":
                continue

            # 3) The child pipeline does not have any jobs.
            child_jobs_url = f"{GITLAB_API_URL}/pipelines/{child_id}/jobs"
            child_jobs = paginate(session, child_jobs_url)
            if len(child_jobs) != 0:
                continue

            # 4) The generate job failed but succeeded upon retry.
            #    The GitLab API unfortunately doesn't provide a clean way to
            #    find the relevant generate job for a given trigger job,
            #    so we get all the jobs for the parent pipeline and look
            #    for those with a particular name.
            generate_job_name = bridge["name"].replace("-build", "-generate")
            found_success = False
            found_failed = False
            parent_jobs_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/jobs?include_retried=true"
            parent_jobs = paginate(session, parent_jobs_url)
            for job in parent_jobs:
                if job["name"] == generate_job_name:
                    if job["status"] == "success":
                        found_success = True
                    elif job["status"] == "failed":
                        found_failed = True
                if found_success and found_failed:
                    # If we found at least one successful and one failed
                    # generate job, retry the trigger job to fix the
                    # child pipeline.
                    print(f"!!! Retrying job #{bridge['id']} to fix pipeline {child_id}")
                    retry_url = f"{GITLAB_API_URL}/jobs/{bridge['id']}/retry"
                    print_response(session.post(retry_url))
                    break


def main():
    """Script entrypoint"""
    if "GITLAB_TOKEN" not in os.environ:
        raise Exception("GITLAB_TOKEN environment variable is not set")

    parser = argparse.ArgumentParser(
        prog="retry_trigger_jobs.py",
        description="Retry child pipelines that failed to generate initially",
    )
    parser.add_argument(
        "hours",
        type=int,
        help="Number of hours to look back for failed child pipelines",
    )
    args = parser.parse_args()

    retry_trigger_jobs(args.hours)


################################################################################
#
if __name__ == "__main__":
    main()
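
As an aside, the four detection conditions above can be read as a single predicate over the GitLab API objects. The following self-contained restatement is illustrative only; the `should_retry` helper and the sample job names are hypothetical, not part of this PR:

```python
def should_retry(bridge, child_jobs, parent_jobs):
    """Return True iff a trigger job matches the failure pattern the script targets."""
    child = bridge.get("downstream_pipeline")
    # 1) There is a child pipeline and it failed.
    if not child or child["status"] != "failed":
        return False
    # 2) The trigger job reports an "unknown_failure".
    if bridge["failure_reason"] != "unknown_failure":
        return False
    # 3) The child pipeline has no jobs at all.
    if child_jobs:
        return False
    # 4) A generate job with the matching name both failed and succeeded.
    generate_name = bridge["name"].replace("-build", "-generate")
    statuses = {j["status"] for j in parent_jobs if j["name"] == generate_name}
    return {"success", "failed"} <= statuses


# Example data matching all four conditions (names are illustrative).
bridge = {
    "name": "e4s-build",
    "failure_reason": "unknown_failure",
    "downstream_pipeline": {"id": 12345, "status": "failed"},
}
parent_jobs = [
    {"name": "e4s-generate", "status": "failed"},   # first attempt
    {"name": "e4s-generate", "status": "success"},  # retry
]
assert should_retry(bridge, [], parent_jobs)
```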
32 changes: 32 additions & 0 deletions k8s/production/custom/retry-trigger-jobs/cron-jobs.yaml
@@ -0,0 +1,32 @@
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: retry-trigger-jobs
  namespace: custom
spec:
  schedule: "*/15 * * * *" # Every 15 minutes
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      activeDeadlineSeconds: 3600 # terminate any running job after 1 hour
      backoffLimit: 0
      template:
        spec:
          serviceAccountName: retry-trigger-jobs
          restartPolicy: Never
          containers:
            - name: retry-trigger-jobs
              image: ghcr.io/spack/retry-trigger-jobs:0.0.1
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 500m
                  memory: 50M
              args:
                - "2" # hours argument to retry_trigger_jobs.py: look back 2 hours
              envFrom:
                - configMapRef:
                    name: python-scripts-sentry-config
          nodeSelector:
            spack.io/node-pool: base