Skip to content

Commit 1a298a1

Browse files
Merge pull request #30243 from PannagaRao/monitor-test-panic
OCPBUGS-48186: Add kubelet and CRI-O panic detection invariant test
2 parents e569d8b + 4f16991 commit 1a298a1

4 files changed

Lines changed: 138 additions & 0 deletions

File tree

pkg/monitor/monitorapi/types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ const (
363363
SourceUnexpectedReady IntervalSource = "NodeUnexpectedNotReady"
364364
SourceUnreachable IntervalSource = "NodeUnreachable"
365365
SourceKubeletLog IntervalSource = "KubeletLog"
366+
SourceCrioLog IntervalSource = "CrioLog"
366367
SourceSystemdCoreDumpLog IntervalSource = "SystemdCoreDumpLog"
367368
SourcePodLog IntervalSource = "PodLog"
368369
SourceEtcdLog IntervalSource = "EtcdLog"
@@ -393,6 +394,8 @@ const (
393394
SourceEtcdDiskCommitDuration IntervalSource = "EtcdDiskCommitDuration"
394395
SourceEtcdDiskWalFsyncDuration IntervalSource = "EtcdDiskWalFsyncDuration"
395396
SourceTestBucket IntervalSource = "TestBucket"
397+
KubeletPanic IntervalReason = "KubeletPanic"
398+
CrioPanic IntervalReason = "CrioPanic"
396399
)
397400

398401
type Interval struct {

pkg/monitortests/node/kubeletlogcollector/monitortest.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ func (w *kubeletLogCollector) EvaluateTestsFromConstructedIntervals(ctx context.
6161
junits = append(junits, nodeFailedLeaseErrorsInRapidSuccession(w.startedAt, finalIntervals)...)
6262
junits = append(junits, nodeFailedLeaseErrorsBackOff(w.startedAt, finalIntervals)...)
6363
junits = append(junits, testNoSystemdCoreDumps(finalIntervals)...)
64+
junits = append(junits, nodeKubeletAndCrioPanicsInvariant(w.startedAt, finalIntervals)...)
6465
return junits, nil
6566
}
6667

@@ -173,3 +174,29 @@ func testNoSystemdCoreDumps(events monitorapi.Intervals) []*junitapi.JUnitTestCa
173174

174175
return []*junitapi.JUnitTestCase{failure}
175176
}
177+
178+
func nodeKubeletAndCrioPanicsInvariant(startedAt time.Time, finalIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
179+
const testName = "[sig-node] node-system-log-collector detects kubelet or CRI-O panics"
180+
var failures []string
181+
182+
intervalsToFailOn := findKubeletAndCrioPanics(finalIntervals)
183+
for _, event := range intervalsToFailOn {
184+
if event.From.After(startedAt) {
185+
failures = append(failures, fmt.Sprintf("%s %v - %v", event.From.Format(time.RFC3339), event.Locator.OldLocator(), event.Message.OldMessage()))
186+
}
187+
}
188+
189+
var tests []*junitapi.JUnitTestCase
190+
if len(failures) > 0 {
191+
tests = append(tests, &junitapi.JUnitTestCase{
192+
Name: testName,
193+
SystemOut: strings.Join(failures, "\n"),
194+
FailureOutput: &junitapi.FailureOutput{
195+
Output: fmt.Sprintf("kubelet-log-collector reports %d kubelet or CRI-O panics.\n\n%v", len(failures), strings.Join(failures, "\n")),
196+
},
197+
})
198+
} else {
199+
tests = append(tests, &junitapi.JUnitTestCase{Name: testName})
200+
}
201+
return tests
202+
}

pkg/monitortests/node/kubeletlogcollector/node.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,21 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
7474
}
7575
newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs(nodeName, systemdCoreDumpLogs)
7676

77+
crioLogs, err := getNodeLog(ctx, kubeClient, nodeName, "crio")
78+
if err != nil {
79+
fmt.Fprintf(os.Stderr, "Error getting node crio logs from %s: %s", nodeName, err.Error())
80+
errCh <- err
81+
return
82+
}
83+
newCrioEvents := eventsFromCrioLogs(nodeName, crioLogs)
84+
7785
lock.Lock()
7886
defer lock.Unlock()
7987
ret = append(ret, newEvents...)
8088
ret = append(ret, newOVSEvents...)
8189
ret = append(ret, newNetworkManagerIntervals...)
8290
ret = append(ret, newSystemdCoreDumpIntervals...)
91+
ret = append(ret, newCrioEvents...)
8392
}(ctx, node.Name)
8493
}
8594
wg.Wait()
@@ -118,6 +127,7 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv
118127
ret = append(ret, leaseUpdateError(nodeLocator, currLine)...)
119128
ret = append(ret, leaseFailBackOff(nodeLocator, currLine)...)
120129
ret = append(ret, parse(nodeName, currLine)...)
130+
ret = append(ret, kubeletPanicDetected(nodeName, currLine)...)
121131
}
122132

123133
return ret
@@ -712,3 +722,63 @@ func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, syst
712722

713723
return ioutil.ReadAll(in)
714724
}
725+
726+
var panicHeadlineRegex = regexp.MustCompile(`(panic:|fatal error:)`)
727+
728+
func kubeletPanicDetected(nodeName, logLine string) monitorapi.Intervals {
729+
return panicDetected(nodeName, logLine,
730+
monitorapi.SourceKubeletLog,
731+
monitorapi.KubeletPanic,
732+
"kubelet panic detected, check logs for details")
733+
}
734+
735+
// eventsFromCrioLogs returns the produced intervals from CRI-O logs.
736+
// Right now it only detects panics, but more detectors can be added as needed.
737+
func eventsFromCrioLogs(nodeName string, crioLog []byte) monitorapi.Intervals {
738+
ret := monitorapi.Intervals{}
739+
740+
scanner := bufio.NewScanner(bytes.NewBuffer(crioLog))
741+
for scanner.Scan() {
742+
currLine := scanner.Text()
743+
ret = append(ret, crioPanicDetected(nodeName, currLine)...)
744+
}
745+
746+
return ret
747+
}
748+
749+
func crioPanicDetected(nodeName, logLine string) monitorapi.Intervals {
750+
return panicDetected(nodeName, logLine,
751+
monitorapi.SourceCrioLog,
752+
monitorapi.CrioPanic,
753+
"CRI-O panic detected, check logs for details")
754+
}
755+
756+
func panicDetected(nodeName, logLine string, source monitorapi.IntervalSource, reason monitorapi.IntervalReason, human string) monitorapi.Intervals {
757+
if !panicHeadlineRegex.MatchString(logLine) {
758+
return nil
759+
}
760+
761+
failureTime := utility.SystemdJournalLogTime(logLine, time.Now().Year())
762+
nodeLocator := monitorapi.NewLocator().NodeFromName(nodeName)
763+
764+
return monitorapi.Intervals{
765+
monitorapi.NewInterval(source, monitorapi.Error).
766+
Locator(nodeLocator).
767+
Message(monitorapi.NewMessage().
768+
Reason(reason).
769+
HumanMessage(human)).
770+
Display().
771+
Build(failureTime, failureTime.Add(time.Second)),
772+
}
773+
}
774+
775+
// findKubeletAndCrioPanics returns all intervals with Reason KubeletPanic or CrioPanic.
776+
func findKubeletAndCrioPanics(intervals monitorapi.Intervals) monitorapi.Intervals {
777+
var panics monitorapi.Intervals
778+
for _, interval := range intervals {
779+
if interval.Message.Reason == monitorapi.KubeletPanic || interval.Message.Reason == monitorapi.CrioPanic {
780+
panics = append(panics, interval)
781+
}
782+
}
783+
return panics
784+
}

pkg/monitortests/node/kubeletlogcollector/node_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,3 +1243,41 @@ func Test_processCoreDump(t *testing.T) {
12431243
})
12441244
}
12451245
}
1246+
1247+
func TestKubeletPanicDetected(t *testing.T) {
1248+
nodeName := "test-node"
1249+
panicLog := "panic: runtime error: invalid memory address or nil pointer dereference"
1250+
nonPanicLog := "normal log line"
1251+
1252+
intervals := kubeletPanicDetected(nodeName, panicLog)
1253+
if len(intervals) == 0 {
1254+
t.Errorf("Expected interval for panic log, got none")
1255+
}
1256+
if intervals[0].Condition.Message.HumanMessage != "kubelet panic detected, check logs for details" {
1257+
t.Errorf("Unexpected message: %v", intervals[0].Condition.Message.HumanMessage)
1258+
}
1259+
1260+
intervals = kubeletPanicDetected(nodeName, nonPanicLog)
1261+
if intervals != nil {
1262+
t.Errorf("Expected nil for non-panic log, got: %v", intervals)
1263+
}
1264+
}
1265+
1266+
func TestCrioPanicDetected(t *testing.T) {
1267+
nodeName := "test-node"
1268+
panicLog := "panic: runtime error: invalid memory address or nil pointer dereference"
1269+
nonPanicLog := "normal log line"
1270+
1271+
intervals := crioPanicDetected(nodeName, panicLog)
1272+
if len(intervals) == 0 {
1273+
t.Errorf("Expected interval for CRI-O panic log, got none")
1274+
}
1275+
if intervals[0].Condition.Message.HumanMessage != "CRI-O panic detected, check logs for details" {
1276+
t.Errorf("Unexpected message: %v", intervals[0].Condition.Message.HumanMessage)
1277+
}
1278+
1279+
intervals = crioPanicDetected(nodeName, nonPanicLog)
1280+
if intervals != nil {
1281+
t.Errorf("Expected nil for non-panic log, got: %v", intervals)
1282+
}
1283+
}

0 commit comments

Comments
 (0)