Skip to content

Commit 0b420cb

Browse files
authored
Merge pull request #2285 from HubSpot/task_history_when_missing
Task history handling when missing data
2 parents ae5c512 + 278eab5 commit 0b420cb

4 files changed

Lines changed: 325 additions & 206 deletions

File tree

SingularityService/src/main/java/com/hubspot/singularity/config/SingularityConfiguration.java

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,6 @@ public class SingularityConfiguration extends Configuration {
458458
private boolean skipPersistingTooLongTaskIds = false;
459459

460460
private boolean allowEmptyRequestInstances = false;
461-
private boolean verifyTaskDataWrites = false;
462461

463462
public long getAskDriverToKillTasksAgainAfterMillis() {
464463
return askDriverToKillTasksAgainAfterMillis;
@@ -2152,12 +2151,4 @@ public boolean allowEmptyRequestInstances() {
21522151
public void setAllowEmptyRequestInstances(boolean allowEmptyRequestInstances) {
21532152
this.allowEmptyRequestInstances = allowEmptyRequestInstances;
21542153
}
2155-
2156-
public boolean isVerifyTaskDataWrites() {
2157-
return verifyTaskDataWrites;
2158-
}
2159-
2160-
public void setVerifyTaskDataWrites(boolean verifyTaskDataWrites) {
2161-
this.verifyTaskDataWrites = verifyTaskDataWrites;
2162-
}
21632154
}

SingularityService/src/main/java/com/hubspot/singularity/data/TaskManager.java

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,15 +1315,29 @@ public void createTaskAndDeletePendingTask(SingularityTask task) {
13151315
public Optional<SingularityTask> tryRepairTask(SingularityTaskId taskId) {
13161316
try {
13171317
Optional<SingularityTask> maybeTask = getTask(taskId); // checks zkCache for task data
1318-
String path = getTaskPath(taskId);
1319-
if (maybeTask.isPresent() && !exists(path)) {
1320-
LOG.info("Found info for task {} from cache not in zk node, rewriting", taskId);
1321-
save(path, maybeTask.map(taskTranscoder::toBytes));
1322-
leaderCache.putActiveTask(taskId);
1318+
if (maybeTask.isPresent() && repairFoundTask(maybeTask.get())) {
1319+
return maybeTask;
13231320
}
1324-
return maybeTask;
13251321
} catch (Exception e) {
1326-
return Optional.empty();
1322+
LOG.error("Could not find or repair task data for {}", taskId, e);
1323+
}
1324+
return Optional.empty();
1325+
}
1326+
1327+
public boolean repairFoundTask(SingularityTask task) {
1328+
try {
1329+
String path = getTaskPath(task.getTaskId());
1330+
LOG.info(
1331+
"Found info for task {} from cache not in zk node, rewriting",
1332+
task.getTaskId()
1333+
);
1334+
save(path, Optional.of(taskTranscoder.toBytes(task)));
1335+
leaderCache.putActiveTask(task.getTaskId());
1336+
taskCache.set(path, task);
1337+
return true;
1338+
} catch (Exception e) {
1339+
LOG.error("Could not repair task data for {}", task.getTaskId(), e);
1340+
return false;
13271341
}
13281342
}
13291343

@@ -1406,13 +1420,6 @@ private void createTaskAndDeletePendingTaskPrivate(SingularityTask task)
14061420
// Not checking isActive here, already called within offer check flow
14071421
leaderCache.putActiveTask(task.getTaskId());
14081422
taskCache.set(path, task);
1409-
if (configuration.isVerifyTaskDataWrites()) {
1410-
Optional<SingularityTask> maybeTask = getTaskCheckCache(task.getTaskId(), true);
1411-
if (!maybeTask.isPresent()) {
1412-
LOG.error("Found empty task after write for {}", task.getTaskId());
1413-
saveTaskDeletePendingInTransaction(hasErr, path, task, taskStatusHolder);
1414-
}
1415-
}
14161423
} catch (KeeperException.NodeExistsException nee) {
14171424
LOG.error("Task or active path already existed for {}", task.getTaskId());
14181425
} catch (Exception e) {

SingularityService/src/main/java/com/hubspot/singularity/data/history/SingularityTaskHistoryPersister.java

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import com.hubspot.singularity.config.SingularityConfiguration;
1515
import com.hubspot.singularity.data.DeployManager;
1616
import com.hubspot.singularity.data.TaskManager;
17+
import com.hubspot.singularity.mesos.SingularitySchedulerLock;
1718
import java.util.ArrayList;
1819
import java.util.Comparator;
1920
import java.util.List;
@@ -42,6 +43,7 @@ public class SingularityTaskHistoryPersister
4243
private final DeployManager deployManager;
4344
private final HistoryManager historyManager;
4445
private final int agentReregisterTimeoutSeconds;
46+
private final SingularitySchedulerLock singularitySchedulerLock;
4547

4648
@Inject
4749
public SingularityTaskHistoryPersister(
@@ -50,6 +52,7 @@ public SingularityTaskHistoryPersister(
5052
DeployManager deployManager,
5153
HistoryManager historyManager,
5254
SingularityManagedThreadPoolFactory managedThreadPoolFactory,
55+
SingularitySchedulerLock singularitySchedulerLock,
5356
@Named(SingularityHistoryModule.PERSISTER_LOCK) ReentrantLock persisterLock,
5457
@Named(
5558
SingularityHistoryModule.LAST_TASK_PERSISTER_SUCCESS
@@ -61,6 +64,7 @@ public SingularityTaskHistoryPersister(
6164
this.deployManager = deployManager;
6265
this.agentReregisterTimeoutSeconds =
6366
configuration.getMesosConfiguration().getAgentReregisterTimeoutSeconds();
67+
this.singularitySchedulerLock = singularitySchedulerLock;
6468
}
6569

6670
@Override
@@ -88,22 +92,30 @@ public void runActionOnPoll() {
8892
() -> {
8993
try {
9094
LOG.debug("Checking request {}", requestId);
91-
List<SingularityTaskId> taskIds = taskManager.getTaskIdsForRequest(
92-
requestId
95+
List<SingularityTaskId> taskIds = singularitySchedulerLock.runWithRequestLockAndReturn(
96+
() -> {
97+
List<SingularityTaskId> ids = taskManager.getTaskIdsForRequest(
98+
requestId
99+
);
100+
ids.removeAll(taskManager.getActiveTaskIdsForRequest(requestId));
101+
ids.removeAll(taskManager.getLBCleanupTasks());
102+
List<SingularityPendingDeploy> pendingDeploys = deployManager.getPendingDeploys();
103+
ids =
104+
ids
105+
.stream()
106+
.filter(
107+
taskId ->
108+
!isPartOfPendingDeploy(pendingDeploys, taskId) &&
109+
!couldReturnWithRecoveredAgent(taskId)
110+
)
111+
.sorted(SingularityTaskId.STARTED_AT_COMPARATOR_DESC)
112+
.collect(Collectors.toList());
113+
return ids;
114+
},
115+
requestId,
116+
"task history persister fetch"
93117
);
94-
taskIds.removeAll(taskManager.getActiveTaskIdsForRequest(requestId));
95-
taskIds.removeAll(taskManager.getLBCleanupTasks());
96-
List<SingularityPendingDeploy> pendingDeploys = deployManager.getPendingDeploys();
97-
taskIds =
98-
taskIds
99-
.stream()
100-
.filter(
101-
taskId ->
102-
!isPartOfPendingDeploy(pendingDeploys, taskId) &&
103-
!couldReturnWithRecoveredAgent(taskId)
104-
)
105-
.sorted(SingularityTaskId.STARTED_AT_COMPARATOR_DESC)
106-
.collect(Collectors.toList());
118+
107119
int forRequest = 0;
108120
int transferred = 0;
109121
for (SingularityTaskId taskId : taskIds) {

0 commit comments

Comments
 (0)