@@ -74,12 +74,21 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface,
7474 }
7575 newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs (nodeName , systemdCoreDumpLogs )
7676
77+ crioLogs , err := getNodeLog (ctx , kubeClient , nodeName , "crio" )
78+ if err != nil {
79+ fmt .Fprintf (os .Stderr , "Error getting node crio logs from %s: %s" , nodeName , err .Error ())
80+ errCh <- err
81+ return
82+ }
83+ newCrioEvents := eventsFromCrioLogs (nodeName , crioLogs )
84+
7785 lock .Lock ()
7886 defer lock .Unlock ()
7987 ret = append (ret , newEvents ... )
8088 ret = append (ret , newOVSEvents ... )
8189 ret = append (ret , newNetworkManagerIntervals ... )
8290 ret = append (ret , newSystemdCoreDumpIntervals ... )
91+ ret = append (ret , newCrioEvents ... )
8392 }(ctx , node .Name )
8493 }
8594 wg .Wait ()
@@ -118,6 +127,7 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv
118127 ret = append (ret , leaseUpdateError (nodeLocator , currLine )... )
119128 ret = append (ret , leaseFailBackOff (nodeLocator , currLine )... )
120129 ret = append (ret , parse (nodeName , currLine )... )
130+ ret = append (ret , kubeletPanicDetected (nodeName , currLine )... )
121131 }
122132
123133 return ret
@@ -712,3 +722,63 @@ func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, syst
712722
713723 return ioutil .ReadAll (in )
714724}
725+
// panicHeadlineRegex matches any log line that contains the literal text
// "panic:" or "fatal error:" anywhere in the line. It is compiled once at
// package scope so the per-line detectors can reuse it.
var panicHeadlineRegex = regexp.MustCompile(`(panic:|fatal error:)`)
727+
728+ func kubeletPanicDetected (nodeName , logLine string ) monitorapi.Intervals {
729+ return panicDetected (nodeName , logLine ,
730+ monitorapi .SourceKubeletLog ,
731+ monitorapi .KubeletPanic ,
732+ "kubelet panic detected, check logs for details" )
733+ }
734+
735+ // eventsFromCrioLogs returns the produced intervals from CRI-O logs.
736+ // Right now it only detects panics, but more detectors can be added as needed.
737+ func eventsFromCrioLogs (nodeName string , crioLog []byte ) monitorapi.Intervals {
738+ ret := monitorapi.Intervals {}
739+
740+ scanner := bufio .NewScanner (bytes .NewBuffer (crioLog ))
741+ for scanner .Scan () {
742+ currLine := scanner .Text ()
743+ ret = append (ret , crioPanicDetected (nodeName , currLine )... )
744+ }
745+
746+ return ret
747+ }
748+
749+ func crioPanicDetected (nodeName , logLine string ) monitorapi.Intervals {
750+ return panicDetected (nodeName , logLine ,
751+ monitorapi .SourceCrioLog ,
752+ monitorapi .CrioPanic ,
753+ "CRI-O panic detected, check logs for details" )
754+ }
755+
756+ func panicDetected (nodeName , logLine string , source monitorapi.IntervalSource , reason monitorapi.IntervalReason , human string ) monitorapi.Intervals {
757+ if ! panicHeadlineRegex .MatchString (logLine ) {
758+ return nil
759+ }
760+
761+ failureTime := utility .SystemdJournalLogTime (logLine , time .Now ().Year ())
762+ nodeLocator := monitorapi .NewLocator ().NodeFromName (nodeName )
763+
764+ return monitorapi.Intervals {
765+ monitorapi .NewInterval (source , monitorapi .Error ).
766+ Locator (nodeLocator ).
767+ Message (monitorapi .NewMessage ().
768+ Reason (reason ).
769+ HumanMessage (human )).
770+ Display ().
771+ Build (failureTime , failureTime .Add (time .Second )),
772+ }
773+ }
774+
775+ // findKubeletAndCrioPanics returns all intervals with Reason KubeletPanic or CrioPanic.
776+ func findKubeletAndCrioPanics (intervals monitorapi.Intervals ) monitorapi.Intervals {
777+ var panics monitorapi.Intervals
778+ for _ , interval := range intervals {
779+ if interval .Message .Reason == monitorapi .KubeletPanic || interval .Message .Reason == monitorapi .CrioPanic {
780+ panics = append (panics , interval )
781+ }
782+ }
783+ return panics
784+ }
0 commit comments