Skip to content

Commit 6324a26

Browse files
authored
Merge pull request #2269 from HubSpot/retry-tasks-with-new-deploys
Retry failed tasks with new deploys
2 parents 3279591 + dd71fbb commit 6324a26

2 files changed

Lines changed: 116 additions & 8 deletions

File tree

SingularityService/src/main/java/com/hubspot/singularity/scheduler/SingularityScheduler.java

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,7 @@ private void handlePendingRequestsForDeployKey(
428428
);
429429

430430
if (!isRequestActive(maybeRequest)) {
431-
LOG.debug(
431+
LOG.info(
432432
"Pending request {} was obsolete (request {})",
433433
requestId,
434434
SingularityRequestWithState.getRequestState(maybeRequest)
@@ -475,7 +475,7 @@ private void handlePendingRequestsForDeployKey(
475475
maybeRequestDeployState
476476
)
477477
) {
478-
LOG.debug(
478+
LOG.info(
479479
"Pending request {} was obsolete (request {})",
480480
pendingRequest,
481481
SingularityRequestWithState.getRequestState(maybeRequest)
@@ -1014,12 +1014,28 @@ private Optional<PendingType> handleCompletedTaskWithStatistics(
10141014
);
10151015

10161016
if (!isDeployInUse(requestDeployState, taskId.getDeployId(), true)) {
1017-
LOG.debug(
1018-
"Task {} completed, but it didn't match active deploy state {} - ignoring",
1019-
taskId.getId(),
1020-
requestDeployState
1021-
);
1022-
return Optional.empty();
1017+
// failed tasks with remaining retries cannot short circuit here
1018+
// isFailed is false for lost and other weird task states
1019+
if (
1020+
!state.isSuccess() &&
1021+
(
1022+
deployStatistics.getNumSequentialRetries() <
1023+
request.getNumRetriesOnFailure().orElse(0)
1024+
)
1025+
) {
1026+
LOG.info(
1027+
"Task {} failed, but it didn't match active deploy state {} - may retry with current deploy state",
1028+
taskId.getId(),
1029+
requestDeployState
1030+
);
1031+
} else {
1032+
LOG.info(
1033+
"Task {} completed, but it didn't match active deploy state {} - ignoring",
1034+
taskId.getId(),
1035+
requestDeployState
1036+
);
1037+
return Optional.empty();
1038+
}
10231039
}
10241040

10251041
if (

SingularityService/src/test/java/com/hubspot/singularity/scheduler/SingularitySchedulerTest.java

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,6 +1255,98 @@ public void testRetriesWithOverrides() {
12551255
Assertions.assertTrue(taskManager.getActiveTaskIds().isEmpty());
12561256
}
12571257

1258+
@Test
1259+
public void testRetriesWithNewDeploys() {
1260+
SingularityRequestBuilder bldr = new SingularityRequestBuilder(
1261+
requestId,
1262+
RequestType.RUN_ONCE
1263+
);
1264+
request = bldr.setNumRetriesOnFailure(Optional.of(2)).build();
1265+
saveRequest(request);
1266+
1267+
deployResource.deploy(
1268+
new SingularityDeployRequest(
1269+
new SingularityDeployBuilder(requestId, "d1")
1270+
.setCommand(Optional.of("cmd"))
1271+
.build(),
1272+
Optional.empty(),
1273+
Optional.empty()
1274+
),
1275+
singularityUser
1276+
);
1277+
1278+
scheduler.drainPendingQueue();
1279+
deployChecker.checkDeploys();
1280+
scheduler.drainPendingQueue();
1281+
resourceOffers();
1282+
1283+
Assertions.assertEquals(1, taskManager.getActiveTaskIds().size());
1284+
1285+
deployResource.deploy(
1286+
new SingularityDeployRequest(
1287+
new SingularityDeployBuilder(requestId, "d2")
1288+
.setCommand(Optional.of("cmd"))
1289+
.build(),
1290+
Optional.empty(),
1291+
Optional.empty()
1292+
),
1293+
singularityUser
1294+
);
1295+
1296+
statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_LOST);
1297+
1298+
scheduler.drainPendingQueue();
1299+
deployChecker.checkDeploys();
1300+
scheduler.drainPendingQueue();
1301+
resourceOffers();
1302+
1303+
Assertions.assertEquals(1, taskManager.getActiveTaskIds().size());
1304+
Assertions.assertEquals("d2", taskManager.getActiveTaskIds().get(0).getDeployId());
1305+
1306+
deployResource.deploy(
1307+
new SingularityDeployRequest(
1308+
new SingularityDeployBuilder(requestId, "d3")
1309+
.setCommand(Optional.of("cmd"))
1310+
.build(),
1311+
Optional.empty(),
1312+
Optional.empty()
1313+
),
1314+
singularityUser
1315+
);
1316+
1317+
statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_LOST);
1318+
1319+
scheduler.drainPendingQueue();
1320+
deployChecker.checkDeploys();
1321+
scheduler.drainPendingQueue();
1322+
resourceOffers();
1323+
1324+
Assertions.assertEquals(1, taskManager.getActiveTaskIds().size());
1325+
Assertions.assertEquals("d3", taskManager.getActiveTaskIds().get(0).getDeployId());
1326+
1327+
deployResource.deploy(
1328+
new SingularityDeployRequest(
1329+
new SingularityDeployBuilder(requestId, "d4")
1330+
.setCommand(Optional.of("cmd"))
1331+
.build(),
1332+
Optional.empty(),
1333+
Optional.empty()
1334+
),
1335+
singularityUser
1336+
);
1337+
1338+
statusUpdate(taskManager.getActiveTasks().get(0), TaskState.TASK_LOST);
1339+
1340+
scheduler.drainPendingQueue();
1341+
deployChecker.checkDeploys();
1342+
scheduler.drainPendingQueue();
1343+
resourceOffers();
1344+
1345+
// TODO - new deploys have new deploy statistics, so we lose track of the # of retries
1346+
Assertions.assertEquals(1, taskManager.getActiveTaskIds().size());
1347+
Assertions.assertEquals("d4", taskManager.getActiveTaskIds().get(0).getDeployId());
1348+
}
1349+
12581350
/* @Test
12591351
public void testCooldownAfterSequentialFailures() {
12601352
initRequest();

0 commit comments

Comments
 (0)