New cron job to automatically retry failed trigger jobs

Mitigates #1031

Commit 0169ded (1 parent: fbe19a4)

Showing 6 changed files with 210 additions and 0 deletions.
Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.11

RUN apt-get update && apt-get install -y \
    gpg \
    gpg-agent && \
    apt-get autoremove --purge -y && \
    apt-get clean

COPY requirements.txt /srcs/requirements.txt

RUN pip install --upgrade pip && \
    pip install --no-cache-dir -r /srcs/requirements.txt

COPY retry_trigger_jobs.py /srcs/retry_trigger_jobs.py

WORKDIR /srcs
ENTRYPOINT ["python", "retry_trigger_jobs.py"]
README
@@ -0,0 +1,11 @@
# Purpose

This script retries child pipelines whose generate job failed initially but succeeded upon retry.

## Background

This [issue](https://github.com/spack/spack-infrastructure/issues/1031) describes the problem in more detail.

## Mitigation

Periodically search for recent child pipelines that match this failure condition and retry them using the GitLab API.
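A minimal sketch of that search, assuming a GITLAB_TOKEN with API access to the gitlab.spack.io project; the full script added in this commit also paginates results, retries HTTP calls with backoff, confirms that a retried generate job eventually succeeded, and reports errors to Sentry:

# Condensed sketch of the mitigation, not the full script below.
import os
from datetime import datetime, timedelta, timezone

import requests

API = "https://gitlab.spack.io/api/v4/projects/2"
session = requests.Session()
session.headers["PRIVATE-TOKEN"] = os.environ["GITLAB_TOKEN"]

# Look at parent pipelines updated over the last two hours.
since = (datetime.now(timezone.utc) - timedelta(hours=2)).isoformat(timespec="seconds")
for pipeline in session.get(f"{API}/pipelines", params={"updated_after": since}).json():
    # Each bridge is a trigger job that may have spawned a child pipeline.
    for bridge in session.get(f"{API}/pipelines/{pipeline['id']}/bridges").json():
        child = bridge.get("downstream_pipeline")
        if (
            child                                              # a child pipeline exists,
            and child["status"] == "failed"                    # it failed,
            and bridge["failure_reason"] == "unknown_failure"  # the bridge reports unknown_failure,
            and not session.get(f"{API}/pipelines/{child['id']}/jobs").json()  # and it has no jobs
        ):
            # Retrying the trigger job (bridge) re-creates the child pipeline.
            session.post(f"{API}/jobs/{bridge['id']}/retry")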
requirements.txt
@@ -0,0 +1,2 @@
requests==2.31.0
sentry-sdk==1.32.0
retry_trigger_jobs.py
@@ -0,0 +1,146 @@
import argparse
import json
import os
import urllib.parse
from datetime import datetime, timedelta, timezone
from requests import Session
from requests.adapters import HTTPAdapter, Retry

import sentry_sdk


sentry_sdk.init(traces_sample_rate=0.01)

GITLAB_API_URL = "https://gitlab.spack.io/api/v4/projects/2"


def paginate(session, query_url):
    """Helper method to get all pages of paginated query results"""
    results = []

    while query_url:
        resp = session.get(query_url)

        resp.raise_for_status()

        next_batch = json.loads(resp.content)

        for result in next_batch:
            results.append(result)

        if "next" in resp.links:
            query_url = resp.links["next"]["url"]
        else:
            query_url = None

    return results


def print_response(resp, padding=''):
    """Helper method to print response status code and content"""
    print(f"{padding}response code: {resp.status_code}")
    print(f"{padding}response value: {resp.text}")


def retry_trigger_jobs(last_n_hours):
    """Analyze pipelines updated over the last_n_hours to find and retry
    child pipelines whose generate jobs failed initially but succeeded
    upon retry."""

    # Set up a Requests session with backoff, retries, and credentials.
    session = Session()
    session.mount(
        "https://",
        HTTPAdapter(
            max_retries=Retry(
                total=5,
                backoff_factor=2,
                backoff_jitter=1,
            ),
        ),
    )
    session.headers.update({
        'PRIVATE-TOKEN': os.environ.get("GITLAB_TOKEN", None)
    })

    # Iterate over recent pipelines.
    dt = datetime.now(timezone.utc) - timedelta(hours=last_n_hours)
    time_threshold = urllib.parse.quote_plus(dt.isoformat(timespec="seconds"))
    pipelines_url = f"{GITLAB_API_URL}/pipelines?updated_after={time_threshold}"
    pipelines = paginate(session, pipelines_url)
    for pipeline in pipelines:
        print(f"Checking pipeline {pipeline['id']}: {pipeline['ref']}")
        # Iterate over the trigger jobs ("bridges") for this parent pipeline.
        parent_id = pipeline['id']
        bridges_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/bridges"
        bridges = paginate(session, bridges_url)
        for bridge in bridges:
            if not bridge["downstream_pipeline"]:
                continue
            child_pipeline = bridge["downstream_pipeline"]
            child_id = child_pipeline["id"]
            # Carefully try to detect the particular failure case we're interested in here.
            #
            # 1) The child pipeline failed.
            if child_pipeline["status"] != "failed":
                continue

            # 2) The trigger job reports an "unknown_failure".
            if bridge["failure_reason"] != "unknown_failure":
                continue

            # 3) The child pipeline does not have any jobs.
            child_jobs_url = f"{GITLAB_API_URL}/pipelines/{child_id}/jobs"
            child_jobs = paginate(session, child_jobs_url)
            if len(child_jobs) != 0:
                continue

            # 4) The generate job failed but succeeded upon retry.
            # GitLab API unfortunately doesn't provide a clean way to
            # find the relevant generate job for a given trigger job,
            # so we get all the jobs for the parent pipeline and look
            # for those with a particular name.
            generate_job_name = bridge["name"].replace("-build", "-generate")
            found_success = False
            found_failed = False
            parent_jobs_url = f"{GITLAB_API_URL}/pipelines/{parent_id}/jobs?include_retried=true"
            parent_jobs = paginate(session, parent_jobs_url)
            for job in parent_jobs:
                if job["name"] == generate_job_name:
                    if job["status"] == "success":
                        found_success = True
                    elif job["status"] == "failed":
                        found_failed = True
                if found_success and found_failed:
                    # If we found at least one success and one failed
                    # generate job, retry the trigger job to fix the
                    # child pipeline.
                    print(f"!!! Retrying job #{bridge['id']} to fix pipeline {child_id}")
                    retry_url = f"{GITLAB_API_URL}/jobs/{bridge['id']}/retry"
                    print_response(session.post(retry_url))
                    break


def main():
    """Script entrypoint"""
    if "GITLAB_TOKEN" not in os.environ:
        raise Exception("GITLAB_TOKEN environment variable is not set")

    parser = argparse.ArgumentParser(
        prog="retry_trigger_jobs.py",
        description="Retry child pipelines that failed to generate initially",
    )
    parser.add_argument(
        "hours",
        type=int,
        help="Number of hours to look back for failed child pipelines"
    )
    args = parser.parse_args()

    retry_trigger_jobs(args.hours)


################################################################################
#
if __name__ == "__main__":
    main()
Kubernetes CronJob manifest
@@ -0,0 +1,32 @@
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: retry-trigger-jobs
  namespace: custom
spec:
  schedule: "*/15 * * * *"  # Every 15 minutes
  concurrencyPolicy: Forbid
  jobTemplate:
    spec:
      activeDeadlineSeconds: 3600  # terminate any running job after 1 hour
      backoffLimit: 0
      template:
        spec:
          serviceAccountName: retry-trigger-jobs
          restartPolicy: Never
          containers:
            - name: retry-trigger-jobs
              image: ghcr.io/spack/retry-trigger-jobs:0.0.1
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 500m
                  memory: 50M
              args:
                - "2"
              envFrom:
                - configMapRef:
                    name: python-scripts-sentry-config
          nodeSelector:
            spack.io/node-pool: base
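The schedule and the script argument appear designed to work together: the CronJob fires every 15 minutes and passes "2" as the hours argument, so consecutive runs scan overlapping two-hour windows of pipeline activity, while concurrencyPolicy: Forbid keeps a slow run from overlapping with the next one.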