Skip to content

Commit b64bdd2

Browse files
authored
fix: windows pList not locked and add more info to minidump file (#35080)
1 parent 29cc3ea commit b64bdd2

11 files changed

Lines changed: 340 additions & 47 deletions

File tree

cmake/define.cmake

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,16 @@ IF(TD_WINDOWS)
145145
# /Zi : generate a separate PDB file (previously /Zi- which disabled it entirely).
146146
# The PDB is NOT shipped to the user but must be archived internally per version
147147
# so that crash dumps from the field can be symbolicated with WinDbg / VS.
148-
SET(COMMON_FLAGS "/W3 /D_WIN32 /DWIN32 /Zi /O2 /GL /MD")
148+
IF(${BUILD_SANITIZER})
149+
MESSAGE("${Green} will build with AddressSanitizer (MSVC ASan)! ${ColourReset}")
150+
# /fsanitize=address: MSVC ASan (incompatible with /GL and /O2)
151+
# _DISABLE_VECTOR_ANNOTATION/_DISABLE_STRING_ANNOTATION: suppress STL ASan
152+
# annotations to avoid LNK2038 mismatch with pre-built libs (e.g. rocksdb)
153+
# that were compiled without /fsanitize=address.
154+
SET(COMMON_FLAGS "/W3 /D_WIN32 /DWIN32 /Zi /O1 /MD /fsanitize=address /D_DISABLE_VECTOR_ANNOTATION=1 /D_DISABLE_STRING_ANNOTATION=1")
155+
ELSE()
156+
SET(COMMON_FLAGS "/W3 /D_WIN32 /DWIN32 /Zi /O2 /GL /MD")
157+
ENDIF()
149158
ELSE()
150159
MESSAGE("${Green} will build Debug version! ${ColourReset}")
151160
# NOTE: let cmake choose the default compile options

source/dnode/mgmt/exe/dmMain.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ void dmLogCrash(int signum, void *sigInfo, void *context) {
190190
// taosIgnSignal(SIGHUP);
191191
// taosIgnSignal(SIGINT);
192192
// taosIgnSignal(SIGBREAK);
193+
dInfo("crash signal is %d", signum);
193194

194195
#ifndef WINDOWS
195196
if (taosIgnSignal(SIGBUS) != 0) {
@@ -200,17 +201,22 @@ void dmLogCrash(int signum, void *sigInfo, void *context) {
200201
dWarn("failed to ignore signal SIGABRT");
201202
}
202203
if (taosIgnSignal(SIGFPE) != 0) {
203-
dWarn("failed to ignore signal SIGABRT");
204+
dWarn("failed to ignore signal SIGFPE");
204205
}
205206
if (taosIgnSignal(SIGSEGV) != 0) {
206-
dWarn("failed to ignore signal SIGABRT");
207+
dWarn("failed to ignore signal SIGSEGV");
207208
}
208209
#ifdef USE_REPORT
209210
writeCrashLogToFile(signum, sigInfo, CUS_PROMPT "d", dmGetClusterId(), global.startTime);
210211
#endif
211212
#ifdef _TD_DARWIN_64
212213
exit(signum);
213214
#elif defined(WINDOWS)
215+
// On Windows, restore default signal handler and re-raise to trigger SEH/FlCrashDump
216+
// This allows the UnhandledExceptionFilter to generate a proper minidump
217+
signal(signum, SIG_DFL);
218+
raise(signum);
219+
// If raise() returns (shouldn't happen), fall through to exit
214220
exit(signum);
215221
#endif
216222
}

source/dnode/mnode/impl/src/mndStreamMgmt.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,13 @@ int32_t msmBuildReaderDeployInfo(SStmTaskDeploy* pDeploy, void* calcScanPlan, SS
768768
} else {
769769
SStreamReaderDeployFromCalc* pCalc = &pMsg->msg.calc;
770770
pCalc->execReplica = pInfo->runnerDeploys * pInfo->runnerReplica;
771-
pCalc->calcScanPlan = calcScanPlan;
771+
if (calcScanPlan) {
772+
pCalc->calcScanPlan = taosStrdup(calcScanPlan);
773+
if (NULL == pCalc->calcScanPlan) {
774+
return terrno;
775+
}
776+
pCalc->freeScanPlan = true;
777+
}
772778
}
773779

774780
return TSDB_CODE_SUCCESS;
@@ -2026,14 +2032,19 @@ static int32_t msmUpdateCalcReaderTasks(SStreamObj* pStream, SNodeList* pSubEP)
20262032

20272033
for (int32_t i = 0; i < taskNum; ++i) {
20282034
SStmTaskToDeployExt* pExt = taosArrayGet(pVg->taskList, i);
2029-
if (pExt->deploy.task.streamId != streamId || STREAM_READER_TASK != pExt->deploy.task.type) {
2035+
if (pExt->deployed || pExt->deploy.task.streamId != streamId || STREAM_READER_TASK != pExt->deploy.task.type) {
20302036
continue;
20312037
}
20322038

20332039
if (!pExt->deploy.msg.reader.triggerReader) {
20342040
SStreamReaderDeployFromCalc* pCalcReaderDeploy = &pExt->deploy.msg.reader.msg.calc;
20352041
TAOS_CHECK_EXIT(nodesStringToNode(pCalcReaderDeploy->calcScanPlan, (SNode**)&pSubplan));
20362042
TAOS_CHECK_EXIT(nodesCloneList(pSubEP, &pSubplan->pSubQ));
2043+
2044+
// Free old calcScanPlan before nodesNodeToString overwrites the pointer
2045+
if (pCalcReaderDeploy->freeScanPlan) {
2046+
taosMemoryFreeClear(pCalcReaderDeploy->calcScanPlan);
2047+
}
20372048
TAOS_CHECK_EXIT(nodesNodeToString((SNode*)pSubplan, false, (char**)&pCalcReaderDeploy->calcScanPlan, NULL));
20382049
pCalcReaderDeploy->freeScanPlan = true;
20392050
nodesDestroyNode((SNode *)pSubplan);

source/dnode/mnode/impl/src/mndStreamUtil.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,13 @@ void mstDestroySStmTaskToDeployExt(void* param) {
7575
case STREAM_READER_TASK:
7676
if (!pExt->deploy.msg.reader.triggerReader) {
7777
SStreamReaderDeployFromCalc* pCalcReaderDeploy = &pExt->deploy.msg.reader.msg.calc;
78-
taosMemoryFreeClear(pCalcReaderDeploy->calcScanPlan);
78+
if (pCalcReaderDeploy->freeScanPlan) {
79+
taosMemoryFreeClear(pCalcReaderDeploy->calcScanPlan);
80+
}
7981
}
8082
break;
8183
default:
82-
break;;
84+
break;
8385
}
8486
}
8587

source/libs/decimal/src/detail/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ aux_source_directory(. WIDE_INTEGER_SRC)
44
SET(CMAKE_CXX_STANDARD 14)
55
add_library(wideInteger STATIC ${WIDE_INTEGER_SRC})
66

7+
# On Windows, suppress STL container annotations unconditionally so that MSVC
8+
# ASan builds (/fsanitize=address) avoid LNK2038 mismatches with pre-built libs
9+
# (e.g. rocksdb) that were compiled without /fsanitize=address.
10+
if(TD_WINDOWS)
11+
target_compile_definitions(wideInteger PRIVATE
12+
_DISABLE_VECTOR_ANNOTATION=1
13+
_DISABLE_STRING_ANNOTATION=1
14+
)
15+
endif()
16+
717
target_include_directories(
818
wideInteger
919
PUBLIC "${TD_SOURCE_DIR}/source/libs/decimal/inc/"

source/libs/new-stream/inc/streamTriggerTask.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,11 +224,12 @@ typedef struct SSTriggerRealtimeContext {
224224
SSHashObj *pGroupColVals; // SSHashObj<gid, SArray<SStreamGroupValue>*>
225225

226226
// these fields need to be cleared each round
227+
bool needCheckAgain;
227228
SSHashObj *pSlices; // SSHashObj<uid, SSTriggerDataSlice>
229+
SObjList dumpTableUids; // SObjList<{uid, vgId}>, backup ids for repeated check in one round
228230
// these fields are shared by all groups and need to reset for each group
229231
bool needPseudoCols;
230232
bool needMergeWindow;
231-
bool needCheckAgain;
232233
SSTriggerNewTimestampSorter *pSorter;
233234
SSTriggerNewVtableMerger *pMerger;
234235
SArray *pParentWindows; // SArray<SSTriggerNotifyWindow>, valid parent windows in this round

source/libs/new-stream/src/streamTriggerTask.c

Lines changed: 72 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3618,6 +3618,8 @@ static int32_t stRealtimeContextInit(SSTriggerRealtimeContext *pContext, SStream
36183618

36193619
pContext->pSlices = tSimpleHashInit(256, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT));
36203620
QUERY_CHECK_NULL(pContext->pSlices, code, lino, _end, terrno);
3621+
code = taosObjListInit(&pContext->dumpTableUids, &pContext->tableUidPool);
3622+
QUERY_CHECK_CODE(code, lino, _end);
36213623
if (pTask->isVirtualTable) {
36223624
code = stRealtimeContextInitPatchContext(pContext);
36233625
QUERY_CHECK_CODE(code, lino, _end);
@@ -3810,6 +3812,7 @@ static void stRealtimeContextDestroy(void *ptr) {
38103812
tSimpleHashCleanup(pContext->pSlices);
38113813
pContext->pSlices = NULL;
38123814
}
3815+
taosObjListClear(&pContext->dumpTableUids);
38133816
stRealtimeContextDestroyPatchContext(pContext);
38143817

38153818
if (pContext->pSorter != NULL) {
@@ -4340,7 +4343,7 @@ static int32_t stRealtimeContextSendPullReq(SSTriggerRealtimeContext *pContext,
43404343

43414344
// send STRIGGER_PULL_GROUP_COL_VALUE for given (pProgress, gid); used when pulling groupInfo for pending create-table
43424345
static int32_t stRealtimeContextSendPullReqForGid(SSTriggerRealtimeContext *pContext, SSTriggerWalProgress *pProgress,
4343-
int64_t gid) {
4346+
int64_t gid) {
43444347
SStreamTriggerTask *pTask = pContext->pTask;
43454348
SSTriggerPullRequest *pReq = &pProgress->pullReq.base;
43464349
SStreamTaskAddr *pReader = pProgress->pTaskAddr;
@@ -5211,6 +5214,31 @@ static int32_t stRealtimeContextCheckIdleGroup(SSTriggerRealtimeContext *pContex
52115214
return code;
52125215
}
52135216

5217+
static int32_t stRealtimeContextCopyTableUids(SSTriggerRealtimeContext *pContext, SObjList *pSrc, SObjList *pDst) {
5218+
int32_t code = TSDB_CODE_SUCCESS;
5219+
int32_t lino = 0;
5220+
int64_t *id = NULL;
5221+
SObjListIter iter = {0};
5222+
SStreamTriggerTask *pTask = pContext->pTask;
5223+
5224+
if (pSrc == NULL || pDst == NULL || pSrc->neles == 0) {
5225+
return TSDB_CODE_SUCCESS;
5226+
}
5227+
5228+
taosObjListClear(pDst);
5229+
taosObjListInitIter(pSrc, &iter, TOBJLIST_ITER_FORWARD);
5230+
while ((id = taosObjListIterNext(&iter)) != NULL) {
5231+
code = taosObjListAppend(pDst, id);
5232+
QUERY_CHECK_CODE(code, lino, _end);
5233+
}
5234+
5235+
_end:
5236+
if (code != TSDB_CODE_SUCCESS) {
5237+
ST_TASK_ELOG("%s failed at line %d since %s", __func__, lino, tstrerror(code));
5238+
}
5239+
return code;
5240+
}
5241+
52145242
static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
52155243
int32_t code = TSDB_CODE_SUCCESS;
52165244
int32_t lino = 0;
@@ -5371,20 +5399,20 @@ static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
53715399
}
53725400
}
53735401

5374-
// Check for idle groups
5375-
code = stRealtimeContextCheckIdleGroup(pContext);
5376-
QUERY_CHECK_CODE(code, lino, _end);
5377-
5378-
while (TD_DLIST_NELES(&pContext->groupsToCheck) > 0) {
5402+
_check:
5403+
while (TD_DLIST_NELES(&pContext->groupsToCheck) > 0 && pContext->status != STRIGGER_CONTEXT_ACQUIRE_REQUEST &&
5404+
pContext->status != STRIGGER_CONTEXT_SEND_CALC_REQ) {
53795405
SSTriggerRealtimeGroup *pGroup = TD_DLIST_HEAD(&pContext->groupsToCheck);
53805406
switch (pContext->status) {
53815407
case STRIGGER_CONTEXT_FETCH_META: {
5382-
pContext->status = STRIGGER_CONTEXT_ACQUIRE_REQUEST;
5383-
}
5384-
case STRIGGER_CONTEXT_ACQUIRE_REQUEST: {
53855408
pContext->status = STRIGGER_CONTEXT_CHECK_CONDITION;
53865409
}
53875410
case STRIGGER_CONTEXT_CHECK_CONDITION: {
5411+
if (pTask->triggerType == STREAM_TRIGGER_SLIDING) {
5412+
// only sliding trigger may need to check again
5413+
code = stRealtimeContextCopyTableUids(pContext, &pGroup->tableUids, &pContext->dumpTableUids);
5414+
QUERY_CHECK_CODE(code, lino, _end);
5415+
}
53885416
code = stRealtimeGroupCheck(pGroup);
53895417
QUERY_CHECK_CODE(code, lino, _end);
53905418
if (pContext->needPseudoCols) {
@@ -5409,18 +5437,29 @@ static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
54095437
QUERY_CHECK_CODE(code, lino, _end);
54105438
}
54115439
}
5412-
if (!pContext->needCheckAgain) {
5413-
stRealtimeGroupClearMetadatas(pGroup);
5414-
TD_DLIST_POP(&pContext->groupsToCheck, pGroup);
5415-
code = stTriggerTaskReadyRecalcRequest(pTask, pGroup);
5440+
if (pContext->needCheckAgain && pTask->triggerType == STREAM_TRIGGER_SLIDING) {
5441+
code = stRealtimeContextCopyTableUids(pContext, &pContext->dumpTableUids, &pGroup->tableUids);
54165442
QUERY_CHECK_CODE(code, lino, _end);
5443+
} else {
5444+
TD_DLIST_POP(&pContext->groupsToCheck, pGroup);
5445+
stRealtimeGroupClearMetadatas(pGroup);
5446+
}
5447+
pContext->needCheckAgain = false;
5448+
taosObjListClear(&pContext->dumpTableUids);
5449+
code = stTriggerTaskReadyRecalcRequest(pTask, pGroup);
5450+
QUERY_CHECK_CODE(code, lino, _end);
5451+
pContext->status = STRIGGER_CONTEXT_FETCH_META;
5452+
if (pGroup->pPendingCalcParams.neles >= STREAM_CALC_REQ_MAX_WIN_NUM ||
5453+
pContext->calcParamPool.size >= STREAM_TRIGGER_MAX_PENDING_PARAMS) {
5454+
break;
54175455
}
5418-
pContext->status = STRIGGER_CONTEXT_ACQUIRE_REQUEST;
54195456
}
54205457

54215458
if (pContext->pMinGroup == NULL && pContext->pMaxDelayHeap->min != NULL) {
54225459
pContext->pMinGroup = container_of(pContext->pMaxDelayHeap->min, SSTriggerRealtimeGroup, heapNode);
5423-
if (pContext->pMinGroup->nextExecTime > now) {
5460+
if (pContext->pMinGroup->nextExecTime > now &&
5461+
pContext->pMinGroup->pPendingCalcParams.neles < STREAM_CALC_REQ_MAX_WIN_NUM &&
5462+
pContext->calcParamPool.size < STREAM_TRIGGER_MAX_PENDING_PARAMS) {
54245463
pContext->pMinGroup = NULL;
54255464
}
54265465
}
@@ -5465,7 +5504,7 @@ static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
54655504
// calc req has not been set
54665505
goto _end;
54675506
}
5468-
if (pTask->placeHolderBitmap & PLACE_HOLDER_PARTITION_ROWS) {
5507+
if (pTask->placeHolderBitmap & PLACE_HOLDER_PARTITION_ROWS && !IS_TRIGGER_GROUP_TO_CHECK(pGroup)) {
54695508
stRealtimeGroupClearMetadatas(pGroup);
54705509
}
54715510
stRealtimeGroupClearTempState(pGroup);
@@ -5490,7 +5529,9 @@ static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
54905529
pContext->status = STRIGGER_CONTEXT_ACQUIRE_REQUEST;
54915530
if (pContext->pMaxDelayHeap->min != NULL) {
54925531
pContext->pMinGroup = container_of(pContext->pMaxDelayHeap->min, SSTriggerRealtimeGroup, heapNode);
5493-
if (pContext->pMinGroup->nextExecTime > now) {
5532+
if (pContext->pMinGroup->nextExecTime > now &&
5533+
pContext->pMinGroup->pPendingCalcParams.neles < STREAM_CALC_REQ_MAX_WIN_NUM &&
5534+
pContext->calcParamPool.size < STREAM_TRIGGER_MAX_PENDING_PARAMS) {
54945535
pContext->pMinGroup = NULL;
54955536
}
54965537
} else {
@@ -5507,6 +5548,11 @@ static int32_t stRealtimeContextCheck(SSTriggerRealtimeContext *pContext) {
55075548
}
55085549
}
55095550

5551+
if (TD_DLIST_NELES(&pContext->groupsToCheck) > 0) {
5552+
pContext->status = STRIGGER_CONTEXT_CHECK_CONDITION;
5553+
goto _check;
5554+
}
5555+
55105556
int32_t deleteGroupNum = taosArrayGetSize(pContext->groupsToDelete);
55115557
if (deleteGroupNum > 0) {
55125558
pContext->status = STRIGGER_CONTEXT_SEND_DROP_REQ;
@@ -6121,6 +6167,9 @@ static int32_t stRealtimeContextProcPullRsp(SSTriggerRealtimeContext *pContext,
61216167
if (latestVersionTime != INT64_MAX) {
61226168
atomic_store_64(&pTask->latestVersionTime, latestVersionTime);
61236169
}
6170+
// Check for idle groups
6171+
code = stRealtimeContextCheckIdleGroup(pContext);
6172+
QUERY_CHECK_CODE(code, lino, _end);
61246173

61256174
if (pContext->recovering && recoveryDone) {
61266175
ST_TASK_DLOG("stop fetch wal metas since recovery is done, pool size: %" PRId64, pContext->metaPool.size);
@@ -6384,6 +6433,9 @@ static int32_t stRealtimeContextProcPullRsp(SSTriggerRealtimeContext *pContext,
63846433
if (latestVersionTime != INT64_MAX) {
63856434
atomic_store_64(&pTask->latestVersionTime, latestVersionTime);
63866435
}
6436+
// Check for idle groups
6437+
code = stRealtimeContextCheckIdleGroup(pContext);
6438+
QUERY_CHECK_CODE(code, lino, _end);
63876439
}
63886440

63896441
pContext->catchUp = (TD_DLIST_NELES(&pContext->groupsToCheck) == 0);
@@ -9001,7 +9053,6 @@ static void stRealtimeGroupClearTempState(SSTriggerRealtimeGroup *pGroup) {
90019053

90029054
pContext->needPseudoCols = false;
90039055
pContext->needMergeWindow = false;
9004-
pContext->needCheckAgain = false;
90059056
if (pContext->pSorter != NULL) {
90069057
stNewTimestampSorterReset(pContext->pSorter);
90079058
}
@@ -9100,7 +9151,7 @@ static int32_t stRealtimeGroupAddMeta(SSTriggerRealtimeGroup *pGroup, int32_t vg
91009151
SObjList *pMetas = NULL;
91019152

91029153
// Update idle trigger timestamps when receiving data
9103-
if (pTask->idleTimeoutMs > 0) {
9154+
if (pTask->idleTimeoutMs > 0 && !pContext->recovering) {
91049155
int64_t prevRecvTimeMono = pGroup->lastRecvTimeMono;
91059156
int64_t prevRecvTimeWall = pGroup->lastRecvTimeWall;
91069157
pGroup->lastRecvTimeMono = taosGetMonoTimestampMs();
@@ -9425,10 +9476,9 @@ static int32_t stRealtimeGroupDoSlidingCheck(SSTriggerRealtimeGroup *pGroup) {
94259476
while (newWin.range.skey <= pGroup->newThreshold) {
94269477
void *px = taosArrayPush(pContext->pWindows, &newWin);
94279478
QUERY_CHECK_NULL(px, code, lino, _end, terrno);
9428-
if (pContext->walMode == STRIGGER_WAL_META_ONLY &&
9429-
TARRAY_SIZE(pContext->pWindows) >= STREAM_CALC_REQ_MAX_WIN_NUM) {
9479+
if (TARRAY_SIZE(pContext->pWindows) >= STREAM_CALC_REQ_MAX_WIN_NUM) {
94309480
pContext->needCheckAgain = true;
9431-
goto _end;
9481+
break;
94329482
}
94339483
stTriggerTaskNextTimeWindow(pTask, &newWin.range);
94349484
}

source/libs/scheduler/src/schTask.c

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,18 +1097,29 @@ int32_t schLaunchRemoteTask(SSchJob *pJob, SSchTask *pTask) {
10971097
if (NULL == pTask->msg) { // TODO add more detailed reason for failure
10981098
SCH_LOCK(SCH_WRITE, &pTask->planLock);
10991099
code = qSubPlanToMsg(plan, &pTask->msg, &pTask->msgLen);
1100+
if (TSDB_CODE_SUCCESS == code && tsQueryPlannerTrace) {
1101+
if (SUBPLAN_TYPE_MODIFY == plan->subplanType) {
1102+
SDataInserterNode *insert = (SDataInserterNode *)plan->pDataSink;
1103+
SCH_TASK_DLOG("MODIFY plan, tables:%d, payload size:%u", insert ? insert->numOfTables : 0,
1104+
insert ? insert->size : 0);
1105+
} else {
1106+
char *msg = NULL;
1107+
int32_t msgLen = 0;
1108+
int32_t traceCode = qSubPlanToString(plan, &msg, &msgLen);
1109+
if (TSDB_CODE_SUCCESS == traceCode) {
1110+
SCH_TASK_DLOGL("physical plan len:%d, %s", msgLen, msg);
1111+
taosMemoryFree(msg);
1112+
} else {
1113+
SCH_TASK_WLOG("plan trace failed, code:%s", tstrerror(traceCode));
1114+
}
1115+
}
1116+
}
11001117
SCH_UNLOCK(SCH_WRITE, &pTask->planLock);
11011118

11021119
if (TSDB_CODE_SUCCESS != code) {
11031120
SCH_TASK_ELOG("failed to create physical plan, code:%s, msg:%p, len:%d", tstrerror(code), pTask->msg,
11041121
pTask->msgLen);
11051122
SCH_ERR_RET(code);
1106-
} else if (tsQueryPlannerTrace) {
1107-
char *msg = NULL;
1108-
int32_t msgLen = 0;
1109-
SCH_ERR_RET(qSubPlanToString(plan, &msg, &msgLen));
1110-
SCH_TASK_DLOGL("physical plan len:%d, %s", msgLen, msg);
1111-
taosMemoryFree(msg);
11121123
}
11131124
}
11141125

0 commit comments

Comments
 (0)