@@ -1195,6 +1195,306 @@ TEST_F(WalRetentionEnv, corruptedDirDeleteLastFile) {
11951195 ASSERT_NE (pWal, nullptr );
11961196 ASSERT_EQ (pWal->vers .firstVer , 0 );
11971197 ASSERT_EQ (pWal->vers .lastVer , 199 );
1198-
1198+
11991199 tsWalDeleteOnCorruption = oldVal;
1200+ }
1201+
1202+ // Test for the bug: data loss after WAL corruption and recovery
1203+ // Scenario:
1204+ // 1. Write data to WAL
1205+ // 2. Simulate crash (close WAL)
1206+ // 3. Manually corrupt the last WAL file by appending garbage data
1207+ // 4. Restart and write new data
1208+ // 5. Restart again - verify new data is not lost
1209+ TEST_F (WalKeepEnv, corruptionRecoveryDataLoss) {
1210+ // Start with clean environment
1211+ walClose (pWal);
1212+ taosRemoveDir (pathName);
1213+
1214+ SWalCfg cfg = {0 };
1215+ cfg.rollPeriod = -1 ;
1216+ cfg.segSize = -1 ;
1217+ cfg.retentionPeriod = 0 ;
1218+ cfg.retentionSize = 0 ;
1219+ cfg.level = TAOS_WAL_FSYNC;
1220+ pWal = walOpen (pathName, &cfg);
1221+ ASSERT_NE (pWal, nullptr );
1222+
1223+ int code;
1224+
1225+ // Step 1: Write initial data (version 0-9)
1226+ for (int i = 0 ; i < 10 ; i++) {
1227+ char newStr[100 ];
1228+ sprintf (newStr, " %s-%d" , ranStr, i);
1229+ int len = strlen (newStr);
1230+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1231+ ASSERT_EQ (code, 0 );
1232+ }
1233+
1234+ // Get the log file path before closing
1235+ SWalFileInfo* pFileInfo = walGetCurFileInfo (pWal);
1236+ ASSERT_NE (pFileInfo, nullptr );
1237+ int64_t firstVer = pFileInfo->firstVer ;
1238+ int64_t validFileSize = pFileInfo->fileSize ;
1239+
1240+ char logFileName[256 ];
1241+ snprintf (logFileName, sizeof (logFileName), " %s" TD_DIRSEP " %020" PRId64 " .log" , pathName, firstVer);
1242+
1243+ printf (" Log file: %s, valid size: %" PRId64 " \n " , logFileName, validFileSize);
1244+
1245+ // Step 2: Simulate crash - close WAL (but don't delete directory)
1246+ walClose (pWal);
1247+ pWal = NULL ;
1248+
1249+ // Step 3: Manually corrupt the WAL file by appending garbage data
1250+ TdFilePtr pFile = taosOpenFile (logFileName, TD_FILE_WRITE | TD_FILE_APPEND);
1251+ if (pFile == nullptr ) {
1252+ printf (" Failed to open file: %s, error: %s\n " , logFileName, strerror (errno));
1253+ }
1254+ ASSERT_NE (pFile, nullptr );
1255+
1256+ // Append 100 bytes of garbage data
1257+ char garbage[100 ];
1258+ memset (garbage, 0xFF , sizeof (garbage));
1259+ int64_t written = taosWriteFile (pFile, garbage, sizeof (garbage));
1260+ ASSERT_EQ (written, sizeof (garbage));
1261+ taosCloseFile (&pFile);
1262+
1263+ printf (" Corrupted WAL file: %s (appended %d bytes of garbage)\n " , logFileName, (int )sizeof (garbage));
1264+
1265+ // Step 4: Restart WAL and write new data (version 10-19)
1266+ pWal = walOpen (pathName, &cfg);
1267+ ASSERT_NE (pWal, nullptr );
1268+
1269+ // Verify that we recovered to version 9
1270+ ASSERT_EQ (pWal->vers .lastVer , 9 );
1271+
1272+ // Write new data after corruption
1273+ for (int i = 10 ; i < 20 ; i++) {
1274+ char newStr[100 ];
1275+ sprintf (newStr, " %s-%d" , ranStr, i);
1276+ int len = strlen (newStr);
1277+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1278+ ASSERT_EQ (code, 0 );
1279+ }
1280+
1281+ ASSERT_EQ (pWal->vers .lastVer , 19 );
1282+
1283+ // Step 5: Restart again and verify all data (0-19) can be read
1284+ walClose (pWal);
1285+ pWal = walOpen (pathName, &cfg);
1286+ ASSERT_NE (pWal, nullptr );
1287+
1288+ // After restart, lastVer should still be 19
1289+ ASSERT_EQ (pWal->vers .lastVer , 19 );
1290+
1291+ // Verify we can read all data from 0 to 19
1292+ SWalReader* pRead = walOpenReader (pWal, 0 );
1293+ ASSERT_NE (pRead, nullptr );
1294+
1295+ for (int i = 0 ; i < 20 ; i++) {
1296+ code = walReadVer (pRead, i);
1297+ ASSERT_EQ (code, 0 ) << " Failed to read version " << i;
1298+
1299+ ASSERT_EQ (pRead->pHead ->head .version , i);
1300+ char expectedStr[100 ];
1301+ sprintf (expectedStr, " %s-%d" , ranStr, i);
1302+ int expectedLen = strlen (expectedStr);
1303+ ASSERT_EQ (pRead->pHead ->head .bodyLen , expectedLen);
1304+
1305+ for (int j = 0 ; j < expectedLen; j++) {
1306+ EXPECT_EQ (expectedStr[j], pRead->pHead ->head .body [j]);
1307+ }
1308+ }
1309+
1310+ walCloseReader (pRead);
1311+
1312+ printf (" SUCCESS: All data (version 0-19) recovered correctly after corruption\n " );
1313+ }
1314+
1315+ // Test for the bug with multiple files
1316+ // Verify that corruption in the last file doesn't affect data recovery
1317+ TEST_F (WalKeepEnv, corruptionRecoveryMultipleFiles) {
1318+ // Start with clean environment
1319+ walClose (pWal);
1320+ taosRemoveDir (pathName);
1321+
1322+ SWalCfg cfg = {0 };
1323+ cfg.rollPeriod = -1 ;
1324+ cfg.segSize = -1 ;
1325+ cfg.retentionPeriod = 0 ;
1326+ cfg.retentionSize = 0 ;
1327+ cfg.level = TAOS_WAL_FSYNC;
1328+ pWal = walOpen (pathName, &cfg);
1329+ ASSERT_NE (pWal, nullptr );
1330+
1331+ int code;
1332+
1333+ // Write data to first file (version 0-99)
1334+ for (int i = 0 ; i < 100 ; i++) {
1335+ char newStr[100 ];
1336+ sprintf (newStr, " %s-%d" , ranStr, i);
1337+ int len = strlen (newStr);
1338+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1339+ ASSERT_EQ (code, 0 );
1340+ }
1341+
1342+ // Roll to create second file
1343+ code = walRollImpl (pWal);
1344+ ASSERT_EQ (code, 0 );
1345+
1346+ // Write data to second file (version 100-199)
1347+ for (int i = 100 ; i < 200 ; i++) {
1348+ char newStr[100 ];
1349+ sprintf (newStr, " %s-%d" , ranStr, i);
1350+ int len = strlen (newStr);
1351+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1352+ ASSERT_EQ (code, 0 );
1353+ }
1354+
1355+ // Get the last log file path
1356+ SWalFileInfo* pFileInfo = walGetCurFileInfo (pWal);
1357+ ASSERT_NE (pFileInfo, nullptr );
1358+ int64_t lastFileFirstVer = pFileInfo->firstVer ;
1359+
1360+ char logFileName[256 ];
1361+ snprintf (logFileName, sizeof (logFileName), " %s" TD_DIRSEP " %020" PRId64 " .log" , pathName, lastFileFirstVer);
1362+
1363+ // Close WAL
1364+ walClose (pWal);
1365+ pWal = NULL ;
1366+
1367+ // Corrupt the last file by appending garbage
1368+ TdFilePtr pFile = taosOpenFile (logFileName, TD_FILE_WRITE | TD_FILE_APPEND);
1369+ ASSERT_NE (pFile, nullptr );
1370+
1371+ char garbage[200 ];
1372+ memset (garbage, 0xAA , sizeof (garbage));
1373+ int64_t written = taosWriteFile (pFile, garbage, sizeof (garbage));
1374+ ASSERT_EQ (written, sizeof (garbage));
1375+ taosCloseFile (&pFile);
1376+
1377+ printf (" Corrupted last WAL file: %s\n " , logFileName);
1378+
1379+ // Restart and write new data
1380+ pWal = walOpen (pathName, &cfg);
1381+ ASSERT_NE (pWal, nullptr );
1382+
1383+ ASSERT_EQ (pWal->vers .lastVer , 199 );
1384+
1385+ // Write new data (version 200-249)
1386+ for (int i = 200 ; i < 250 ; i++) {
1387+ char newStr[100 ];
1388+ sprintf (newStr, " %s-%d" , ranStr, i);
1389+ int len = strlen (newStr);
1390+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1391+ ASSERT_EQ (code, 0 );
1392+ }
1393+
1394+ ASSERT_EQ (pWal->vers .lastVer , 249 );
1395+
1396+ // Restart and verify all data
1397+ walClose (pWal);
1398+ pWal = walOpen (pathName, &cfg);
1399+ ASSERT_NE (pWal, nullptr );
1400+
1401+ ASSERT_EQ (pWal->vers .lastVer , 249 );
1402+
1403+ // Verify we can read all data
1404+ SWalReader* pRead = walOpenReader (pWal, 0 );
1405+ ASSERT_NE (pRead, nullptr );
1406+
1407+ for (int i = 0 ; i < 250 ; i++) {
1408+ code = walReadVer (pRead, i);
1409+ ASSERT_EQ (code, 0 ) << " Failed to read version " << i;
1410+ ASSERT_EQ (pRead->pHead ->head .version , i);
1411+ }
1412+
1413+ walCloseReader (pRead);
1414+
1415+ printf (" SUCCESS: All data recovered correctly with multiple files\n " );
1416+ }
1417+
1418+ // Test behavior when corruption is in the middle of a WAL file (file size unchanged).
1419+ // The repair logic triggers a full scan when fileSize != meta fileSize (e.g. appended garbage).
1420+ // For in-place overwrites the file size matches, so the scan is skipped and lastVer stays at 19.
1421+ // Reading entries at/after the corrupted offset will fail with a checksum error.
1422+ TEST_F (WalKeepEnv, corruptionInMiddleOfFile) {
1423+ // Start with clean environment
1424+ walClose (pWal);
1425+ taosRemoveDir (pathName);
1426+
1427+ SWalCfg cfg = {0 };
1428+ cfg.rollPeriod = -1 ;
1429+ cfg.segSize = -1 ;
1430+ cfg.retentionPeriod = 0 ;
1431+ cfg.retentionSize = 0 ;
1432+ cfg.level = TAOS_WAL_FSYNC;
1433+ pWal = walOpen (pathName, &cfg);
1434+ ASSERT_NE (pWal, nullptr );
1435+
1436+ int code;
1437+
1438+ // Write initial data (version 0-19)
1439+ for (int i = 0 ; i < 20 ; i++) {
1440+ char newStr[100 ];
1441+ sprintf (newStr, " %s-%d" , ranStr, i);
1442+ int len = strlen (newStr);
1443+ code = walAppendLog (pWal, i, 0 , syncMeta, newStr, len, NULL );
1444+ ASSERT_EQ (code, 0 );
1445+ }
1446+
1447+ SWalFileInfo* pFileInfo = walGetCurFileInfo (pWal);
1448+ ASSERT_NE (pFileInfo, nullptr );
1449+ int64_t validFileSize = pFileInfo->fileSize ;
1450+ int64_t firstVer = pFileInfo->firstVer ;
1451+
1452+ char logFileName[256 ];
1453+ snprintf (logFileName, sizeof (logFileName), " %s" TD_DIRSEP " %020" PRId64 " .log" , pathName, firstVer);
1454+
1455+ // Close WAL
1456+ walClose (pWal);
1457+ pWal = NULL ;
1458+
1459+ // Corrupt by overwriting middle of file with garbage (file size unchanged)
1460+ TdFilePtr pFile = taosOpenFile (logFileName, TD_FILE_WRITE);
1461+ ASSERT_NE (pFile, nullptr );
1462+
1463+ int64_t corruptOffset = validFileSize / 2 ;
1464+ taosLSeekFile (pFile, corruptOffset, SEEK_SET);
1465+
1466+ char garbage[50 ];
1467+ memset (garbage, 0xBB , sizeof (garbage));
1468+ taosWriteFile (pFile, garbage, sizeof (garbage));
1469+ taosCloseFile (&pFile);
1470+
1471+ printf (" Corrupted middle of WAL file at offset %" PRId64 " \n " , corruptOffset);
1472+
1473+ // Restart - file size matches meta, so repair scan is skipped; lastVer stays at 19
1474+ pWal = walOpen (pathName, &cfg);
1475+ ASSERT_NE (pWal, nullptr );
1476+ ASSERT_EQ (pWal->vers .lastVer , 19 );
1477+
1478+ // Reading entries at/after the corrupted offset will fail
1479+ SWalReader* pRead = walOpenReader (pWal, 0 );
1480+ ASSERT_NE (pRead, nullptr );
1481+
1482+ // Version 0 (before corruption) should be readable
1483+ code = walReadVer (pRead, 0 );
1484+ ASSERT_EQ (code, 0 );
1485+
1486+ // At least one entry in the corrupted region should fail
1487+ bool foundError = false ;
1488+ for (int i = 0 ; i < 20 ; i++) {
1489+ if (walReadVer (pRead, i) != 0 ) {
1490+ foundError = true ;
1491+ printf (" Read error at version %d as expected\n " , i);
1492+ break ;
1493+ }
1494+ }
1495+ ASSERT_TRUE (foundError) << " Expected at least one read failure due to middle corruption" ;
1496+
1497+ walCloseReader (pRead);
1498+
1499+ printf (" SUCCESS: Middle corruption detected at read time as expected\n " );
12001500}
0 commit comments