Skip to content

Commit d10efc9

Browse files
authored
fix: data loss when the last WAL log file is corrupted. (#35138)
1 parent da2d1c7 commit d10efc9

2 files changed

Lines changed: 310 additions & 14 deletions

File tree

source/libs/wal/src/walMeta.c

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -245,20 +245,16 @@ FORCE_INLINE int32_t walScanLogGetLastVer(SWal* pWal, int32_t fileIdx, int64_t*
245245
code = TSDB_CODE_WAL_LOG_NOT_EXIST;
246246
}
247247

248-
// truncate file
248+
// truncate file to remove corruption
249+
// For Raft log semantics, we must truncate at the last valid entry
250+
// because log entries must be continuous without gaps
249251
if (lastEntryEndOffset != fileSize) {
250-
if(fileIdx < sz - 1){
251-
wWarn("vgId:%d, repair meta truncate file %s to %" PRId64 ", orig size %" PRId64, pWal->cfg.vgId, fnameStr,
252-
lastEntryEndOffset, fileSize);
253-
254-
if (taosFtruncateFile(pFile, lastEntryEndOffset) < 0) {
255-
wError("vgId:%d, failed to truncate file %s since %s", pWal->cfg.vgId, fnameStr, strerror(terrno));
256-
TAOS_CHECK_GOTO(terrno, &lino, _err);
257-
}
258-
}
259-
else{
260-
wWarn("vgId:%d, skip to truncate file in repair meta %s to %" PRId64 ", orig size %" PRId64 " but fileIdx:%d is invalid",
261-
pWal->cfg.vgId, fnameStr, lastEntryEndOffset, fileSize, fileIdx);
252+
wWarn("vgId:%d, repair meta truncate file %s to %" PRId64 ", orig size %" PRId64 ", fileIdx:%d",
253+
pWal->cfg.vgId, fnameStr, lastEntryEndOffset, fileSize, fileIdx);
254+
255+
if (taosFtruncateFile(pFile, lastEntryEndOffset) < 0) {
256+
wError("vgId:%d, failed to truncate file %s since %s", pWal->cfg.vgId, fnameStr, strerror(terrno));
257+
TAOS_CHECK_GOTO(terrno, &lino, _err);
262258
}
263259

264260
if (pWal->cfg.level != TAOS_WAL_SKIP && taosFsyncFile(pFile) < 0) {

source/libs/wal/test/walMetaTest.cpp

Lines changed: 301 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1195,6 +1195,306 @@ TEST_F(WalRetentionEnv, corruptedDirDeleteLastFile) {
11951195
ASSERT_NE(pWal, nullptr);
11961196
ASSERT_EQ(pWal->vers.firstVer, 0);
11971197
ASSERT_EQ(pWal->vers.lastVer, 199);
1198-
1198+
11991199
tsWalDeleteOnCorruption = oldVal;
1200+
}
1201+
1202+
// Test for the bug: data loss after WAL corruption and recovery
1203+
// Scenario:
1204+
// 1. Write data to WAL
1205+
// 2. Simulate crash (close WAL)
1206+
// 3. Manually corrupt the last WAL file by appending garbage data
1207+
// 4. Restart and write new data
1208+
// 5. Restart again - verify new data is not lost
1209+
TEST_F(WalKeepEnv, corruptionRecoveryDataLoss) {
1210+
// Start with clean environment
1211+
walClose(pWal);
1212+
taosRemoveDir(pathName);
1213+
1214+
SWalCfg cfg = {0};
1215+
cfg.rollPeriod = -1;
1216+
cfg.segSize = -1;
1217+
cfg.retentionPeriod = 0;
1218+
cfg.retentionSize = 0;
1219+
cfg.level = TAOS_WAL_FSYNC;
1220+
pWal = walOpen(pathName, &cfg);
1221+
ASSERT_NE(pWal, nullptr);
1222+
1223+
int code;
1224+
1225+
// Step 1: Write initial data (version 0-9)
1226+
for (int i = 0; i < 10; i++) {
1227+
char newStr[100];
1228+
sprintf(newStr, "%s-%d", ranStr, i);
1229+
int len = strlen(newStr);
1230+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1231+
ASSERT_EQ(code, 0);
1232+
}
1233+
1234+
// Get the log file path before closing
1235+
SWalFileInfo* pFileInfo = walGetCurFileInfo(pWal);
1236+
ASSERT_NE(pFileInfo, nullptr);
1237+
int64_t firstVer = pFileInfo->firstVer;
1238+
int64_t validFileSize = pFileInfo->fileSize;
1239+
1240+
char logFileName[256];
1241+
snprintf(logFileName, sizeof(logFileName), "%s" TD_DIRSEP "%020" PRId64 ".log", pathName, firstVer);
1242+
1243+
printf("Log file: %s, valid size: %" PRId64 "\n", logFileName, validFileSize);
1244+
1245+
// Step 2: Simulate crash - close WAL (but don't delete directory)
1246+
walClose(pWal);
1247+
pWal = NULL;
1248+
1249+
// Step 3: Manually corrupt the WAL file by appending garbage data
1250+
TdFilePtr pFile = taosOpenFile(logFileName, TD_FILE_WRITE | TD_FILE_APPEND);
1251+
if (pFile == nullptr) {
1252+
printf("Failed to open file: %s, error: %s\n", logFileName, strerror(errno));
1253+
}
1254+
ASSERT_NE(pFile, nullptr);
1255+
1256+
// Append 100 bytes of garbage data
1257+
char garbage[100];
1258+
memset(garbage, 0xFF, sizeof(garbage));
1259+
int64_t written = taosWriteFile(pFile, garbage, sizeof(garbage));
1260+
ASSERT_EQ(written, sizeof(garbage));
1261+
taosCloseFile(&pFile);
1262+
1263+
printf("Corrupted WAL file: %s (appended %d bytes of garbage)\n", logFileName, (int)sizeof(garbage));
1264+
1265+
// Step 4: Restart WAL and write new data (version 10-19)
1266+
pWal = walOpen(pathName, &cfg);
1267+
ASSERT_NE(pWal, nullptr);
1268+
1269+
// Verify that we recovered to version 9
1270+
ASSERT_EQ(pWal->vers.lastVer, 9);
1271+
1272+
// Write new data after corruption
1273+
for (int i = 10; i < 20; i++) {
1274+
char newStr[100];
1275+
sprintf(newStr, "%s-%d", ranStr, i);
1276+
int len = strlen(newStr);
1277+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1278+
ASSERT_EQ(code, 0);
1279+
}
1280+
1281+
ASSERT_EQ(pWal->vers.lastVer, 19);
1282+
1283+
// Step 5: Restart again and verify all data (0-19) can be read
1284+
walClose(pWal);
1285+
pWal = walOpen(pathName, &cfg);
1286+
ASSERT_NE(pWal, nullptr);
1287+
1288+
// After restart, lastVer should still be 19
1289+
ASSERT_EQ(pWal->vers.lastVer, 19);
1290+
1291+
// Verify we can read all data from 0 to 19
1292+
SWalReader* pRead = walOpenReader(pWal, 0);
1293+
ASSERT_NE(pRead, nullptr);
1294+
1295+
for (int i = 0; i < 20; i++) {
1296+
code = walReadVer(pRead, i);
1297+
ASSERT_EQ(code, 0) << "Failed to read version " << i;
1298+
1299+
ASSERT_EQ(pRead->pHead->head.version, i);
1300+
char expectedStr[100];
1301+
sprintf(expectedStr, "%s-%d", ranStr, i);
1302+
int expectedLen = strlen(expectedStr);
1303+
ASSERT_EQ(pRead->pHead->head.bodyLen, expectedLen);
1304+
1305+
for (int j = 0; j < expectedLen; j++) {
1306+
EXPECT_EQ(expectedStr[j], pRead->pHead->head.body[j]);
1307+
}
1308+
}
1309+
1310+
walCloseReader(pRead);
1311+
1312+
printf("SUCCESS: All data (version 0-19) recovered correctly after corruption\n");
1313+
}
1314+
1315+
// Test for the bug with multiple files
1316+
// Verify that corruption in the last file doesn't affect data recovery
1317+
TEST_F(WalKeepEnv, corruptionRecoveryMultipleFiles) {
1318+
// Start with clean environment
1319+
walClose(pWal);
1320+
taosRemoveDir(pathName);
1321+
1322+
SWalCfg cfg = {0};
1323+
cfg.rollPeriod = -1;
1324+
cfg.segSize = -1;
1325+
cfg.retentionPeriod = 0;
1326+
cfg.retentionSize = 0;
1327+
cfg.level = TAOS_WAL_FSYNC;
1328+
pWal = walOpen(pathName, &cfg);
1329+
ASSERT_NE(pWal, nullptr);
1330+
1331+
int code;
1332+
1333+
// Write data to first file (version 0-99)
1334+
for (int i = 0; i < 100; i++) {
1335+
char newStr[100];
1336+
sprintf(newStr, "%s-%d", ranStr, i);
1337+
int len = strlen(newStr);
1338+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1339+
ASSERT_EQ(code, 0);
1340+
}
1341+
1342+
// Roll to create second file
1343+
code = walRollImpl(pWal);
1344+
ASSERT_EQ(code, 0);
1345+
1346+
// Write data to second file (version 100-199)
1347+
for (int i = 100; i < 200; i++) {
1348+
char newStr[100];
1349+
sprintf(newStr, "%s-%d", ranStr, i);
1350+
int len = strlen(newStr);
1351+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1352+
ASSERT_EQ(code, 0);
1353+
}
1354+
1355+
// Get the last log file path
1356+
SWalFileInfo* pFileInfo = walGetCurFileInfo(pWal);
1357+
ASSERT_NE(pFileInfo, nullptr);
1358+
int64_t lastFileFirstVer = pFileInfo->firstVer;
1359+
1360+
char logFileName[256];
1361+
snprintf(logFileName, sizeof(logFileName), "%s" TD_DIRSEP "%020" PRId64 ".log", pathName, lastFileFirstVer);
1362+
1363+
// Close WAL
1364+
walClose(pWal);
1365+
pWal = NULL;
1366+
1367+
// Corrupt the last file by appending garbage
1368+
TdFilePtr pFile = taosOpenFile(logFileName, TD_FILE_WRITE | TD_FILE_APPEND);
1369+
ASSERT_NE(pFile, nullptr);
1370+
1371+
char garbage[200];
1372+
memset(garbage, 0xAA, sizeof(garbage));
1373+
int64_t written = taosWriteFile(pFile, garbage, sizeof(garbage));
1374+
ASSERT_EQ(written, sizeof(garbage));
1375+
taosCloseFile(&pFile);
1376+
1377+
printf("Corrupted last WAL file: %s\n", logFileName);
1378+
1379+
// Restart and write new data
1380+
pWal = walOpen(pathName, &cfg);
1381+
ASSERT_NE(pWal, nullptr);
1382+
1383+
ASSERT_EQ(pWal->vers.lastVer, 199);
1384+
1385+
// Write new data (version 200-249)
1386+
for (int i = 200; i < 250; i++) {
1387+
char newStr[100];
1388+
sprintf(newStr, "%s-%d", ranStr, i);
1389+
int len = strlen(newStr);
1390+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1391+
ASSERT_EQ(code, 0);
1392+
}
1393+
1394+
ASSERT_EQ(pWal->vers.lastVer, 249);
1395+
1396+
// Restart and verify all data
1397+
walClose(pWal);
1398+
pWal = walOpen(pathName, &cfg);
1399+
ASSERT_NE(pWal, nullptr);
1400+
1401+
ASSERT_EQ(pWal->vers.lastVer, 249);
1402+
1403+
// Verify we can read all data
1404+
SWalReader* pRead = walOpenReader(pWal, 0);
1405+
ASSERT_NE(pRead, nullptr);
1406+
1407+
for (int i = 0; i < 250; i++) {
1408+
code = walReadVer(pRead, i);
1409+
ASSERT_EQ(code, 0) << "Failed to read version " << i;
1410+
ASSERT_EQ(pRead->pHead->head.version, i);
1411+
}
1412+
1413+
walCloseReader(pRead);
1414+
1415+
printf("SUCCESS: All data recovered correctly with multiple files\n");
1416+
}
1417+
1418+
// Test behavior when corruption is in the middle of a WAL file (file size unchanged).
1419+
// The repair logic triggers a full scan when fileSize != meta fileSize (e.g. appended garbage).
1420+
// For in-place overwrites the file size matches, so the scan is skipped and lastVer stays at 19.
1421+
// Reading entries at/after the corrupted offset will fail with a checksum error.
1422+
TEST_F(WalKeepEnv, corruptionInMiddleOfFile) {
1423+
// Start with clean environment
1424+
walClose(pWal);
1425+
taosRemoveDir(pathName);
1426+
1427+
SWalCfg cfg = {0};
1428+
cfg.rollPeriod = -1;
1429+
cfg.segSize = -1;
1430+
cfg.retentionPeriod = 0;
1431+
cfg.retentionSize = 0;
1432+
cfg.level = TAOS_WAL_FSYNC;
1433+
pWal = walOpen(pathName, &cfg);
1434+
ASSERT_NE(pWal, nullptr);
1435+
1436+
int code;
1437+
1438+
// Write initial data (version 0-19)
1439+
for (int i = 0; i < 20; i++) {
1440+
char newStr[100];
1441+
sprintf(newStr, "%s-%d", ranStr, i);
1442+
int len = strlen(newStr);
1443+
code = walAppendLog(pWal, i, 0, syncMeta, newStr, len, NULL);
1444+
ASSERT_EQ(code, 0);
1445+
}
1446+
1447+
SWalFileInfo* pFileInfo = walGetCurFileInfo(pWal);
1448+
ASSERT_NE(pFileInfo, nullptr);
1449+
int64_t validFileSize = pFileInfo->fileSize;
1450+
int64_t firstVer = pFileInfo->firstVer;
1451+
1452+
char logFileName[256];
1453+
snprintf(logFileName, sizeof(logFileName), "%s" TD_DIRSEP "%020" PRId64 ".log", pathName, firstVer);
1454+
1455+
// Close WAL
1456+
walClose(pWal);
1457+
pWal = NULL;
1458+
1459+
// Corrupt by overwriting middle of file with garbage (file size unchanged)
1460+
TdFilePtr pFile = taosOpenFile(logFileName, TD_FILE_WRITE);
1461+
ASSERT_NE(pFile, nullptr);
1462+
1463+
int64_t corruptOffset = validFileSize / 2;
1464+
taosLSeekFile(pFile, corruptOffset, SEEK_SET);
1465+
1466+
char garbage[50];
1467+
memset(garbage, 0xBB, sizeof(garbage));
1468+
taosWriteFile(pFile, garbage, sizeof(garbage));
1469+
taosCloseFile(&pFile);
1470+
1471+
printf("Corrupted middle of WAL file at offset %" PRId64 "\n", corruptOffset);
1472+
1473+
// Restart - file size matches meta, so repair scan is skipped; lastVer stays at 19
1474+
pWal = walOpen(pathName, &cfg);
1475+
ASSERT_NE(pWal, nullptr);
1476+
ASSERT_EQ(pWal->vers.lastVer, 19);
1477+
1478+
// Reading entries at/after the corrupted offset will fail
1479+
SWalReader* pRead = walOpenReader(pWal, 0);
1480+
ASSERT_NE(pRead, nullptr);
1481+
1482+
// Version 0 (before corruption) should be readable
1483+
code = walReadVer(pRead, 0);
1484+
ASSERT_EQ(code, 0);
1485+
1486+
// At least one entry in the corrupted region should fail
1487+
bool foundError = false;
1488+
for (int i = 0; i < 20; i++) {
1489+
if (walReadVer(pRead, i) != 0) {
1490+
foundError = true;
1491+
printf("Read error at version %d as expected\n", i);
1492+
break;
1493+
}
1494+
}
1495+
ASSERT_TRUE(foundError) << "Expected at least one read failure due to middle corruption";
1496+
1497+
walCloseReader(pRead);
1498+
1499+
printf("SUCCESS: Middle corruption detected at read time as expected\n");
12001500
}

0 commit comments

Comments
 (0)