Skip to content

Commit

Permalink
Adding a metric to count the number of jobs stuck in accepted state (#571)
Browse files Browse the repository at this point in the history

Co-authored-by: Sundaram Ananthanarayanan <[email protected]>
  • Loading branch information
sundargates and sundargates authored Oct 20, 2023
1 parent 6ddc00f commit 2b1ad2d
Showing 1 changed file with 4 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ public class JobClusterActor extends AbstractActorWithTimers implements IJobClus
private final Counter numJobClusterDeleteErrors;
private final Counter numJobClusterUpdate;
private final Counter numJobClusterUpdateErrors;
// Registered under the metric name "numJobsStuckInAccepted"; the SLA enforcement path
// increments it by the size of the list returned by jobManager.getJobsStuckInAccepted(...).
// NOTE(review): being a counter, a job still stuck on a later enforcement pass would
// presumably be added again — confirm whether a gauge is the intended semantics.
private final Counter numJobsStuckInAccepted;
private final Counter numSLAEnforcementExecutions;


Expand Down Expand Up @@ -261,6 +262,7 @@ public JobClusterActor(
.addCounter("numJobClusterUpdate")
.addCounter("numJobClusterUpdateErrors")
.addCounter("numSLAEnforcementExecutions")
.addCounter("numJobsStuckInAccepted")
.addGauge(new GaugeCallback(metricGroupId, "acceptedJobsGauge", () -> 1.0 * this.jobManager.acceptedJobsCount()))
.addGauge(new GaugeCallback(metricGroupId, "activeJobsGauge", () -> 1.0 * this.jobManager.activeJobsCount()))
.addGauge(new GaugeCallback(metricGroupId, "terminatingJobsGauge", () -> 1.0 * this.jobManager.terminatingJobsMap.size()))
Expand All @@ -284,6 +286,7 @@ public JobClusterActor(
this.numJobClusterDeleteErrors = m.getCounter("numJobClusterDeleteErrors");
this.numJobClusterUpdateErrors = m.getCounter("numJobClusterUpdateErrors");
this.numSLAEnforcementExecutions = m.getCounter("numSLAEnforcementExecutions");
this.numJobsStuckInAccepted = m.getCounter("numJobsStuckInAccepted");
}


Expand Down Expand Up @@ -1976,6 +1979,7 @@ public void onEnforceSLARequest(JobClusterProto.EnforceSLARequest request) {
List<JobInfo> pendingInitializationJobsPriorToCutoff = jobManager.getJobActorsStuckInInit(now, getExpirePendingInitializeDelayMs());

List<JobInfo> jobsStuckInAcceptedList = jobManager.getJobsStuckInAccepted(now, getExpireAcceptedDelayMs());
numJobsStuckInAccepted.increment(jobsStuckInAcceptedList.size());

List<JobInfo> jobsStuckInTerminatingList = jobManager.getJobsStuckInTerminating(now, getExpireAcceptedDelayMs());

Expand Down

0 comments on commit 2b1ad2d

Please sign in to comment.