@@ -4,6 +4,7 @@ package utils
44import (
55 "context"
66 "encoding/json"
7+ "encoding/xml"
78 "fmt"
89 "net"
910 "slices"
@@ -24,6 +25,7 @@ import (
2425 "k8s.io/apimachinery/pkg/util/yaml"
2526 "k8s.io/klog/v2"
2627 "k8s.io/kubectl/pkg/util/podutils"
28+ nodeutil "k8s.io/kubernetes/pkg/util/node"
2729 "k8s.io/kubernetes/test/e2e/framework"
2830 nodehelper "k8s.io/kubernetes/test/e2e/framework/node"
2931 e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
@@ -40,8 +42,39 @@ const (
4042
4143 clusterIsHealthyTimeout = 5 * time .Minute
4244 pollInterval = 5 * time .Second
45+ // Pacemaker timestamp format for parsing operation history
46+ pacemakerTimeFormat = "Mon Jan 2 15:04:05 2006"
4347)
4448
// Minimal XML types for parsing "pcs status xml" node history.
// Only the attributes needed for failure detection are mapped; any other
// elements/attributes in the pacemaker-result document are ignored by
// encoding/xml during unmarshalling.

// pcsStatusResult is the document root (<pacemaker-result>) of "pcs status xml".
type pcsStatusResult struct {
	XMLName     xml.Name       `xml:"pacemaker-result"`
	NodeHistory pcsNodeHistory `xml:"node_history"`
}

// pcsNodeHistory wraps the <node_history> section: one <node> entry per cluster node.
type pcsNodeHistory struct {
	Node []pcsNodeHistoryNode `xml:"node"`
}

// pcsNodeHistoryNode is a <node> element carrying per-resource operation
// history recorded on the named cluster node.
type pcsNodeHistoryNode struct {
	Name            string               `xml:"name,attr"`
	ResourceHistory []pcsResourceHistory `xml:"resource_history"`
}

// pcsResourceHistory is a <resource_history> element: the operation history for
// one resource instance (clone instances appear as e.g. "kubelet:0", "kubelet:1").
type pcsResourceHistory struct {
	ID               string                `xml:"id,attr"`
	OperationHistory []pcsOperationHistory `xml:"operation_history"`
}

// pcsOperationHistory tracks resource operation results for failure detection.
// RC is the agent return code as a string; "0" means success, anything else is
// treated as a failure. LastRCChange is a human-readable timestamp string
// (expected to match pacemakerTimeFormat when parsed).
type pcsOperationHistory struct {
	Task         string `xml:"task,attr"`
	RC           string `xml:"rc,attr"`
	RCText       string `xml:"rc_text,attr"`
	LastRCChange string `xml:"last-rc-change,attr"`
}
77+
4578// DecodeObject decodes YAML or JSON data into a Kubernetes runtime object using generics.
4679//
4780// var bmh metal3v1alpha1.BareMetalHost
@@ -304,6 +337,85 @@ func IsServiceRunning(oc *exutil.CLI, execNode string, targetNode string, servic
304337 return isActive
305338}
306339
// RecentResourceFailure captures details about a resource failure from Pacemaker operation history.
type RecentResourceFailure struct {
	ResourceID   string    // resource_history id (clone instances look like "kubelet:0")
	Task         string    // operation task name, from the task attribute
	NodeName     string    // cluster node on which the failure was recorded
	RC           string    // agent return code as reported by pacemaker (non-"0" here)
	RCText       string    // human-readable description of RC
	LastRCChange time.Time // parsed time at which the return code last changed
}
349+
350+ // HasRecentResourceFailure checks if a resource had any failed operations within the given time window.
351+ // Uses "pcs status xml" to parse the node_history section for operations with non-zero return codes.
352+ // This is useful for detecting that pacemaker noticed and responded to a resource failure,
353+ // even if auto-recovery has already restored the resource.
354+ //
355+ // hasFailure, failures, err := HasRecentResourceFailure(oc, "master-0", "kubelet-clone", 5*time.Minute)
356+ func HasRecentResourceFailure (oc * exutil.CLI , execNodeName string , resourceID string , timeWindow time.Duration ) (bool , []RecentResourceFailure , error ) {
357+ framework .Logf ("Checking for recent failures of resource %s within %v" , resourceID , timeWindow )
358+
359+ output , err := exutil .DebugNodeRetryWithOptionsAndChroot (
360+ oc , execNodeName , "default" , "bash" , "-c" , "sudo pcs status xml" )
361+
362+ if err != nil {
363+ return false , nil , fmt .Errorf ("failed to get pcs status xml: %v" , err )
364+ }
365+
366+ var result pcsStatusResult
367+ if parseErr := xml .Unmarshal ([]byte (output ), & result ); parseErr != nil {
368+ return false , nil , fmt .Errorf ("failed to parse pcs status xml: %v" , parseErr )
369+ }
370+
371+ cutoffTime := time .Now ().Add (- timeWindow )
372+ var failures []RecentResourceFailure
373+
374+ for _ , node := range result .NodeHistory .Node {
375+ // Match resource ID (handles clone resources like "kubelet-clone" matching "kubelet:0", "kubelet:1")
376+ for _ , resourceHistory := range node .ResourceHistory {
377+ if ! strings .HasPrefix (resourceHistory .ID , strings .TrimSuffix (resourceID , "-clone" )) {
378+ continue
379+ }
380+
381+ for _ , operation := range resourceHistory .OperationHistory {
382+ // RC "0" means success, anything else is a failure
383+ if operation .RC == "0" {
384+ continue
385+ }
386+
387+ // Parse the timestamp
388+ opTime , parseErr := time .Parse (pacemakerTimeFormat , operation .LastRCChange )
389+ if parseErr != nil {
390+ framework .Logf ("Warning: failed to parse timestamp %q: %v" , operation .LastRCChange , parseErr )
391+ continue
392+ }
393+
394+ // Check if within time window
395+ if ! opTime .After (cutoffTime ) {
396+ continue
397+ }
398+
399+ failure := RecentResourceFailure {
400+ ResourceID : resourceHistory .ID ,
401+ Task : operation .Task ,
402+ NodeName : node .Name ,
403+ RC : operation .RC ,
404+ RCText : operation .RCText ,
405+ LastRCChange : opTime ,
406+ }
407+ failures = append (failures , failure )
408+ framework .Logf ("Found recent failure: resource=%s task=%s node=%s rc=%s (%s) at %s" ,
409+ failure .ResourceID , failure .Task , failure .NodeName , failure .RC , failure .RCText , failure .LastRCChange )
410+ }
411+ }
412+ }
413+
414+ hasFailure := len (failures ) > 0
415+ framework .Logf ("Resource %s has %d recent failures within %v window" , resourceID , len (failures ), timeWindow )
416+ return hasFailure , failures , nil
417+ }
418+
307419// ValidateClusterOperatorsAvailable validates that all cluster operators are available and not degraded.
308420//
309421// if err := ValidateClusterOperatorsAvailable(oc); err != nil { return err }
@@ -511,26 +623,17 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactory
511623 } else {
512624 framework .Logf ("=== Enhanced Node and Etcd Member Analysis ===" )
513625
514- // Check if both nodes are healthy
626+ // Check if both nodes are healthy using vendored nodeutil.IsNodeReady
515627 framework .Logf ("Checking node health status..." )
516- healthyNodes := 0
517628 readyNodes := 0
518- for _ , node := range nodeList .Items {
519- isReady := false
520- for _ , condition := range node .Status .Conditions {
521- if condition .Type == corev1 .NodeReady && condition .Status == corev1 .ConditionTrue {
522- isReady = true
523- readyNodes ++
524- break
525- }
526- }
527-
528- framework .Logf (" - Node %s: Ready=%t, Roles=%s" ,
529- node .Name , isReady , getNodeRoles (& node ))
530-
629+ for i := range nodeList .Items {
630+ node := & nodeList .Items [i ]
631+ isReady := nodeutil .IsNodeReady (node )
531632 if isReady {
532- healthyNodes ++
633+ readyNodes ++
533634 }
635+ framework .Logf (" - Node %s: Ready=%t, Roles=%s" ,
636+ node .Name , isReady , getNodeRoles (node ))
534637 }
535638 framework .Logf ("Node health summary: %d total nodes, %d ready nodes" , len (nodeList .Items ), readyNodes )
536639
@@ -540,12 +643,24 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactory
540643 learnerMembers := 0
541644 healthyMembers := 0
542645
543- for _ , node := range nodeList .Items {
646+ // Fetch etcd members once for all nodes
647+ var members []* etcdserverpb.Member
648+ if etcdClientFactory != nil {
649+ var err error
650+ members , err = GetMembers (etcdClientFactory )
651+ if err != nil {
652+ framework .Logf ("WARNING: Failed to get etcd members: %v" , err )
653+ }
654+ }
655+
656+ for i := range nodeList .Items {
657+ node := & nodeList .Items [i ]
544658 // Check if this node has an etcd pod
545659 var etcdPod * corev1.Pod
546- for _ , pod := range etcdPods .Items {
660+ for j := range etcdPods .Items {
661+ pod := & etcdPods .Items [j ]
547662 if pod .Spec .NodeName == node .Name && pod .Status .Phase == corev1 .PodRunning {
548- etcdPod = & pod
663+ etcdPod = pod
549664 break
550665 }
551666 }
@@ -554,19 +669,19 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactory
554669 framework .Logf (" - Node %s: has running etcd pod %s" , node .Name , etcdPod .Name )
555670 healthyMembers ++
556671
557- // Try to determine if member is promoted (voting) or learner
558- // Only check if etcdClientFactory is provided
559- if etcdClientFactory != nil {
560- memberStatus := checkEtcdMemberPromotionStatus (oc , node .Name , etcdClientFactory )
561- switch memberStatus {
562- case "voting" :
563- votingMembers ++
564- framework .Logf (" └─ Member status: VOTING (promoted)" )
565- case "learner" :
672+ // Determine if member is promoted (voting) or learner using pre-fetched members
673+ if members != nil {
674+ started , isLearner , err := GetMemberState (node , members )
675+ if err != nil {
676+ framework .Logf (" └─ Member status: UNKNOWN (%v)" , err )
677+ } else if ! started {
678+ framework .Logf (" └─ Member status: NOT STARTED (added but not joined)" )
679+ } else if isLearner {
566680 learnerMembers ++
567681 framework .Logf (" └─ Member status: LEARNER (not yet promoted)" )
568- default :
569- framework .Logf (" └─ Member status: UNKNOWN (unable to determine)" )
682+ } else {
683+ votingMembers ++
684+ framework .Logf (" └─ Member status: VOTING (promoted)" )
570685 }
571686 } else {
572687 framework .Logf (" └─ Member status: UNKNOWN (etcdClientFactory not provided)" )
@@ -805,35 +920,6 @@ func getNodeRoles(node *corev1.Node) string {
805920 return strings .Join (roles , "," )
806921}
807922
808- // checkEtcdMemberPromotionStatus queries the etcd API to determine if a specific member is promoted (voting) or learner.
809- // Returns "voting", "learner", or "unknown" based on the actual member status from etcd.
810- func checkEtcdMemberPromotionStatus (oc * exutil.CLI , nodeName string , etcdClientFactory * helpers.EtcdClientFactoryImpl ) string {
811- etcdClient , closeFn , err := etcdClientFactory .NewEtcdClient ()
812- if err != nil {
813- return "unknown"
814- }
815- defer closeFn ()
816-
817- ctx , cancel := context .WithTimeout (context .Background (), 15 * time .Second )
818- defer cancel ()
819- memberList , err := etcdClient .MemberList (ctx )
820- if err != nil {
821- return "unknown"
822- }
823-
824- // Find the member corresponding to this node
825- for _ , member := range memberList .Members {
826- if member .Name == nodeName {
827- if member .IsLearner {
828- return "learner"
829- }
830- return "voting"
831- }
832- }
833-
834- return "unknown"
835- }
836-
837923// checkCEORevisionControllerStatus checks the status of the Cluster Etcd Operator revision controller
838924func checkCEORevisionControllerStatus (oc * exutil.CLI ) error {
839925 framework .Logf ("Checking CEO revision controller status..." )
0 commit comments