Commit 16bd817

Fixes suggested by jpoulin

1 parent: 940c08c

2 files changed: 157 additions & 71 deletions

test/extended/two_node/tnf_kubelet_disruption.go (12 additions & 12 deletions)

@@ -191,28 +191,28 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
         return isRunning
     }, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name))

+    // Record the time before stopping kubelet to filter failures
+    stopTime := time.Now()
+
     g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name))
     err = utils.StopKubeletService(oc, targetNode.Name)
     o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name))

-    g.By("Verifying Pacemaker detected kubelet as stopped via pcs status")
-    o.Eventually(func() bool {
-        stopped, err := utils.IsResourceStopped(oc, survivingNode.Name, "kubelet-clone")
-        if err != nil {
-            framework.Logf("Error checking kubelet-clone status: %v", err)
-            return false
-        }
-        framework.Logf("kubelet-clone stopped on %s: %v", targetNode.Name, stopped)
-        return stopped
-    }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), "Pacemaker should detect kubelet-clone as stopped")
-
-    g.By("Verifying Pacemaker restarted kubelet-clone service")
+    g.By("Waiting for Pacemaker to auto-recover and restart kubelet-clone service")
     o.Eventually(func() bool {
         isRunning := utils.IsServiceRunning(oc, survivingNode.Name, targetNode.Name, "kubelet")
         framework.Logf("Kubelet running on %s: %v", targetNode.Name, isRunning)
         return isRunning
     }, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet should be running on %s after Pacemaker restart", targetNode.Name))

+    g.By("Verifying Pacemaker recorded the kubelet failure in operation history")
+    // Use a time window from when we stopped kubelet to now
+    failureWindow := time.Since(stopTime) + time.Minute // Add buffer for clock skew
+    hasFailure, failures, err := utils.HasRecentResourceFailure(oc, survivingNode.Name, "kubelet-clone", failureWindow)
+    o.Expect(err).To(o.BeNil(), "Expected to check resource failure history without errors")
+    o.Expect(hasFailure).To(o.BeTrue(), "Pacemaker should have recorded kubelet failure in operation history")
+    framework.Logf("Pacemaker recorded %d failure(s) for kubelet-clone: %+v", len(failures), failures)
+
     g.By("Validating both nodes are Ready after Pacemaker restart")
     for _, node := range nodes {
         o.Eventually(func() bool {
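
A note on the window computed above: it is measured from stopTime and padded by a minute because, as the inline comment says, the node's pcs timestamps and the test runner's clock may disagree. A small self-contained sketch of what the pad buys (the timings and the skew value are made up for illustration):

package main

import (
    "fmt"
    "time"
)

func main() {
    // Pretend kubelet was stopped two minutes ago (test-runner clock).
    stopTime := time.Now().Add(-2 * time.Minute)

    // Suppose the node's clock runs 30s behind the runner: a failure that
    // Pacemaker recorded 5s after the stop carries a timestamp that reads
    // as 25s *before* stopTime once parsed on the runner.
    recordedFailure := stopTime.Add(5 * time.Second).Add(-30 * time.Second)

    exactWindow := time.Since(stopTime)         // no buffer
    bufferedWindow := exactWindow + time.Minute // what the test uses

    for _, window := range []time.Duration{exactWindow, bufferedWindow} {
        cutoff := time.Now().Add(-window)
        fmt.Printf("window=%-6v failure counted as recent: %v\n",
            window.Round(time.Second), recordedFailure.After(cutoff))
    }
    // Prints false for the exact window and true for the buffered one.
}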

test/extended/two_node/utils/common.go (145 additions & 59 deletions)

@@ -4,6 +4,7 @@ package utils
 import (
     "context"
     "encoding/json"
+    "encoding/xml"
     "fmt"
     "net"
     "slices"
@@ -24,6 +25,7 @@ import (
     "k8s.io/apimachinery/pkg/util/yaml"
     "k8s.io/klog/v2"
     "k8s.io/kubectl/pkg/util/podutils"
+    nodeutil "k8s.io/kubernetes/pkg/util/node"
     "k8s.io/kubernetes/test/e2e/framework"
     nodehelper "k8s.io/kubernetes/test/e2e/framework/node"
     e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
@@ -40,8 +42,39 @@ const (

     clusterIsHealthyTimeout = 5 * time.Minute
     pollInterval            = 5 * time.Second
+    // Pacemaker timestamp format for parsing operation history
+    pacemakerTimeFormat = "Mon Jan 2 15:04:05 2006"
 )

+// Minimal XML types for parsing "pcs status xml" node history.
+// Used to detect recent resource failures via operation history.
+type pcsStatusResult struct {
+    XMLName     xml.Name       `xml:"pacemaker-result"`
+    NodeHistory pcsNodeHistory `xml:"node_history"`
+}
+
+type pcsNodeHistory struct {
+    Node []pcsNodeHistoryNode `xml:"node"`
+}
+
+type pcsNodeHistoryNode struct {
+    Name            string               `xml:"name,attr"`
+    ResourceHistory []pcsResourceHistory `xml:"resource_history"`
+}
+
+type pcsResourceHistory struct {
+    ID               string                `xml:"id,attr"`
+    OperationHistory []pcsOperationHistory `xml:"operation_history"`
+}
+
+// pcsOperationHistory tracks resource operation results for failure detection.
+type pcsOperationHistory struct {
+    Task         string `xml:"task,attr"`
+    RC           string `xml:"rc,attr"`
+    RCText       string `xml:"rc_text,attr"`
+    LastRCChange string `xml:"last-rc-change,attr"`
+}
+
 // DecodeObject decodes YAML or JSON data into a Kubernetes runtime object using generics.
 //
 // var bmh metal3v1alpha1.BareMetalHost
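
An aside on what these struct tags expect to find: they describe the only slice of "pcs status xml" output the new helper reads. The standalone snippet below redeclares the unexported types so it compiles on its own; the XML fragment is hand-written to match the tags, and real pcs output carries far more elements and attributes.

package main

import (
    "encoding/xml"
    "fmt"
    "time"
)

// Local copies of the unexported types added in this commit, so the example
// compiles on its own.
type pcsStatusResult struct {
    XMLName     xml.Name       `xml:"pacemaker-result"`
    NodeHistory pcsNodeHistory `xml:"node_history"`
}

type pcsNodeHistory struct {
    Node []pcsNodeHistoryNode `xml:"node"`
}

type pcsNodeHistoryNode struct {
    Name            string               `xml:"name,attr"`
    ResourceHistory []pcsResourceHistory `xml:"resource_history"`
}

type pcsResourceHistory struct {
    ID               string                `xml:"id,attr"`
    OperationHistory []pcsOperationHistory `xml:"operation_history"`
}

type pcsOperationHistory struct {
    Task         string `xml:"task,attr"`
    RC           string `xml:"rc,attr"`
    RCText       string `xml:"rc_text,attr"`
    LastRCChange string `xml:"last-rc-change,attr"`
}

const pacemakerTimeFormat = "Mon Jan 2 15:04:05 2006"

// Hand-written fragment shaped to match the struct tags above; not actual
// pcs output.
const sample = `
<pacemaker-result>
  <node_history>
    <node name="master-1">
      <resource_history id="kubelet">
        <operation_history task="monitor" rc="7" rc_text="not running"
          last-rc-change="Mon Jan 2 15:04:05 2006"/>
      </resource_history>
    </node>
  </node_history>
</pacemaker-result>`

func main() {
    var result pcsStatusResult
    if err := xml.Unmarshal([]byte(sample), &result); err != nil {
        panic(err)
    }

    node := result.NodeHistory.Node[0]
    op := node.ResourceHistory[0].OperationHistory[0]

    when, err := time.Parse(pacemakerTimeFormat, op.LastRCChange)
    if err != nil {
        panic(err)
    }
    fmt.Printf("node=%s resource=%s task=%s rc=%s (%s) at %s\n",
        node.Name, node.ResourceHistory[0].ID, op.Task, op.RC, op.RCText,
        when.Format(time.RFC3339))
}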
@@ -304,6 +337,85 @@ func IsServiceRunning(oc *exutil.CLI, execNode string, targetNode string, servic
     return isActive
 }

+// RecentResourceFailure captures details about a resource failure from Pacemaker operation history.
+type RecentResourceFailure struct {
+    ResourceID   string
+    Task         string
+    NodeName     string
+    RC           string
+    RCText       string
+    LastRCChange time.Time
+}
+
+// HasRecentResourceFailure checks if a resource had any failed operations within the given time window.
+// Uses "pcs status xml" to parse the node_history section for operations with non-zero return codes.
+// This is useful for detecting that pacemaker noticed and responded to a resource failure,
+// even if auto-recovery has already restored the resource.
+//
+// hasFailure, failures, err := HasRecentResourceFailure(oc, "master-0", "kubelet-clone", 5*time.Minute)
+func HasRecentResourceFailure(oc *exutil.CLI, execNodeName string, resourceID string, timeWindow time.Duration) (bool, []RecentResourceFailure, error) {
+    framework.Logf("Checking for recent failures of resource %s within %v", resourceID, timeWindow)
+
+    output, err := exutil.DebugNodeRetryWithOptionsAndChroot(
+        oc, execNodeName, "default", "bash", "-c", "sudo pcs status xml")
+
+    if err != nil {
+        return false, nil, fmt.Errorf("failed to get pcs status xml: %v", err)
+    }
+
+    var result pcsStatusResult
+    if parseErr := xml.Unmarshal([]byte(output), &result); parseErr != nil {
+        return false, nil, fmt.Errorf("failed to parse pcs status xml: %v", parseErr)
+    }
+
+    cutoffTime := time.Now().Add(-timeWindow)
+    var failures []RecentResourceFailure
+
+    for _, node := range result.NodeHistory.Node {
+        // Match resource ID (handles clone resources like "kubelet-clone" matching "kubelet:0", "kubelet:1")
+        for _, resourceHistory := range node.ResourceHistory {
+            if !strings.HasPrefix(resourceHistory.ID, strings.TrimSuffix(resourceID, "-clone")) {
+                continue
+            }
+
+            for _, operation := range resourceHistory.OperationHistory {
+                // RC "0" means success, anything else is a failure
+                if operation.RC == "0" {
+                    continue
+                }
+
+                // Parse the timestamp
+                opTime, parseErr := time.Parse(pacemakerTimeFormat, operation.LastRCChange)
+                if parseErr != nil {
+                    framework.Logf("Warning: failed to parse timestamp %q: %v", operation.LastRCChange, parseErr)
+                    continue
+                }
+
+                // Check if within time window
+                if !opTime.After(cutoffTime) {
+                    continue
+                }
+
+                failure := RecentResourceFailure{
+                    ResourceID:   resourceHistory.ID,
+                    Task:         operation.Task,
+                    NodeName:     node.Name,
+                    RC:           operation.RC,
+                    RCText:       operation.RCText,
+                    LastRCChange: opTime,
+                }
+                failures = append(failures, failure)
+                framework.Logf("Found recent failure: resource=%s task=%s node=%s rc=%s (%s) at %s",
+                    failure.ResourceID, failure.Task, failure.NodeName, failure.RC, failure.RCText, failure.LastRCChange)
+            }
+        }
+    }
+
+    hasFailure := len(failures) > 0
+    framework.Logf("Resource %s has %d recent failures within %v window", resourceID, len(failures), timeWindow)
+    return hasFailure, failures, nil
+}
+
 // ValidateClusterOperatorsAvailable validates that all cluster operators are available and not degraded.
 //
 // if err := ValidateClusterOperatorsAvailable(oc); err != nil { return err }
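
One detail of the helper worth spelling out: the clone handling above is a plain prefix match, trimming "-clone" from the requested resource ID and accepting any operation-history entry whose ID starts with the remainder. A tiny standalone illustration:

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Same matching rule as HasRecentResourceFailure: trim the "-clone"
    // suffix, then match history entry IDs by prefix.
    base := strings.TrimSuffix("kubelet-clone", "-clone") // "kubelet"

    for _, id := range []string{"kubelet", "kubelet:0", "kubelet:1", "etcd"} {
        fmt.Printf("%-10s matches: %v\n", id, strings.HasPrefix(id, base))
    }
    // kubelet, kubelet:0 and kubelet:1 match; etcd does not.
}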
@@ -511,26 +623,17 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactory
     } else {
         framework.Logf("=== Enhanced Node and Etcd Member Analysis ===")

-        // Check if both nodes are healthy
+        // Check if both nodes are healthy using vendored nodeutil.IsNodeReady
         framework.Logf("Checking node health status...")
-        healthyNodes := 0
         readyNodes := 0
-        for _, node := range nodeList.Items {
-            isReady := false
-            for _, condition := range node.Status.Conditions {
-                if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
-                    isReady = true
-                    readyNodes++
-                    break
-                }
-            }
-
-            framework.Logf(" - Node %s: Ready=%t, Roles=%s",
-                node.Name, isReady, getNodeRoles(&node))
-
+        for i := range nodeList.Items {
+            node := &nodeList.Items[i]
+            isReady := nodeutil.IsNodeReady(node)
             if isReady {
-                healthyNodes++
+                readyNodes++
             }
+            framework.Logf(" - Node %s: Ready=%t, Roles=%s",
+                node.Name, isReady, getNodeRoles(node))
         }
         framework.Logf("Node health summary: %d total nodes, %d ready nodes", len(nodeList.Items), readyNodes)

@@ -540,12 +643,24 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactor
         learnerMembers := 0
         healthyMembers := 0

-        for _, node := range nodeList.Items {
+        // Fetch etcd members once for all nodes
+        var members []*etcdserverpb.Member
+        if etcdClientFactory != nil {
+            var err error
+            members, err = GetMembers(etcdClientFactory)
+            if err != nil {
+                framework.Logf("WARNING: Failed to get etcd members: %v", err)
+            }
+        }
+
+        for i := range nodeList.Items {
+            node := &nodeList.Items[i]
             // Check if this node has an etcd pod
             var etcdPod *corev1.Pod
-            for _, pod := range etcdPods.Items {
+            for j := range etcdPods.Items {
+                pod := &etcdPods.Items[j]
                 if pod.Spec.NodeName == node.Name && pod.Status.Phase == corev1.PodRunning {
-                    etcdPod = &pod
+                    etcdPod = pod
                     break
                 }
             }
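
The index-based loops introduced in this hunk take the address of a slice element rather than of a range variable. One plausible reason, beyond keeping linters quiet, is the pre-Go 1.22 behavior in which the range variable is a single reused variable; a minimal standalone illustration of the difference:

package main

import "fmt"

func main() {
    items := []string{"a", "b", "c"}

    var byRangeVar []*string
    for _, item := range items {
        byRangeVar = append(byRangeVar, &item) // address of the loop variable
    }

    var byIndex []*string
    for i := range items {
        byIndex = append(byIndex, &items[i]) // address of the slice element
    }

    for i := range items {
        // Before Go 1.22 the first column prints "c c c"; the second always
        // prints "a b c". Since Go 1.22 each iteration gets a fresh variable
        // and both print "a b c".
        fmt.Println(*byRangeVar[i], *byIndex[i])
    }
}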
@@ -554,19 +669,19 @@ func LogEtcdClusterStatus(oc *exutil.CLI, testContext string, etcdClientFactory
            framework.Logf(" - Node %s: has running etcd pod %s", node.Name, etcdPod.Name)
            healthyMembers++

-           // Try to determine if member is promoted (voting) or learner
-           // Only check if etcdClientFactory is provided
-           if etcdClientFactory != nil {
-               memberStatus := checkEtcdMemberPromotionStatus(oc, node.Name, etcdClientFactory)
-               switch memberStatus {
-               case "voting":
-                   votingMembers++
-                   framework.Logf(" └─ Member status: VOTING (promoted)")
-               case "learner":
+           // Determine if member is promoted (voting) or learner using pre-fetched members
+           if members != nil {
+               started, isLearner, err := GetMemberState(node, members)
+               if err != nil {
+                   framework.Logf(" └─ Member status: UNKNOWN (%v)", err)
+               } else if !started {
+                   framework.Logf(" └─ Member status: NOT STARTED (added but not joined)")
+               } else if isLearner {
                    learnerMembers++
                    framework.Logf(" └─ Member status: LEARNER (not yet promoted)")
-               default:
-                   framework.Logf(" └─ Member status: UNKNOWN (unable to determine)")
+               } else {
+                   votingMembers++
+                   framework.Logf(" └─ Member status: VOTING (promoted)")
                }
            } else {
                framework.Logf(" └─ Member status: UNKNOWN (etcdClientFactory not provided)")
@@ -805,35 +920,6 @@ func getNodeRoles(node *corev1.Node) string {
     return strings.Join(roles, ",")
 }

-// checkEtcdMemberPromotionStatus queries the etcd API to determine if a specific member is promoted (voting) or learner.
-// Returns "voting", "learner", or "unknown" based on the actual member status from etcd.
-func checkEtcdMemberPromotionStatus(oc *exutil.CLI, nodeName string, etcdClientFactory *helpers.EtcdClientFactoryImpl) string {
-    etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
-    if err != nil {
-        return "unknown"
-    }
-    defer closeFn()
-
-    ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
-    defer cancel()
-    memberList, err := etcdClient.MemberList(ctx)
-    if err != nil {
-        return "unknown"
-    }
-
-    // Find the member corresponding to this node
-    for _, member := range memberList.Members {
-        if member.Name == nodeName {
-            if member.IsLearner {
-                return "learner"
-            }
-            return "voting"
-        }
-    }
-
-    return "unknown"
-}
-
 // checkCEORevisionControllerStatus checks the status of the Cluster Etcd Operator revision controller
 func checkCEORevisionControllerStatus(oc *exutil.CLI) error {
     framework.Logf("Checking CEO revision controller status...")
