Commit b047597

dhensel-rhclaude committed

OCPEDGE-1484: Add kubelet failure tests in two-node recovery suite

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent e713d4e commit b047597
2 files changed: 771 additions & 5 deletions

New test file shown below: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
package two_node

import (
	"context"
	"fmt"
	"time"

	g "github.com/onsi/ginkgo/v2"
	o "github.com/onsi/gomega"
	v1 "github.com/openshift/api/config/v1"
	"github.com/openshift/origin/test/extended/etcd/helpers"
	"github.com/openshift/origin/test/extended/two_node/utils"
	exutil "github.com/openshift/origin/test/extended/util"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	nodeutil "k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/test/e2e/framework"
)

const (
	kubeletDisruptionTimeout     = 10 * time.Minute // Timeout for kubelet disruption scenarios
	kubeletRestoreTimeout        = 5 * time.Minute  // Time to wait for kubelet service restore
	kubeletPollInterval          = 10 * time.Second // Poll interval for kubelet status checks
	kubeletGracePeriod           = 30 * time.Second // Grace period for kubelet to start/stop
	pacemakerMonitorDetectPeriod = 15 * time.Second // Time to wait for Pacemaker to detect kubelet state changes
)
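
// These tests exercise kubelet failures on a Two Node with Fencing cluster, where Pacemaker
// manages kubelet as the "kubelet-clone" resource. Each scenario disrupts kubelet on one node
// and verifies that the nodes, the etcd cluster, and the essential operators recover.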
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
	defer g.GinkgoRecover()

	var (
		oc                = exutil.NewCLIWithoutNamespace("two-node-kubelet").AsAdmin()
		etcdClientFactory *helpers.EtcdClientFactoryImpl
	)

	g.BeforeEach(func() {
		utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)

		nodes, err := utils.GetNodes(oc, utils.AllNodes)
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
		o.Expect(len(nodes.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

		etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())

		utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory, nodes)
	})

	g.AfterEach(func() {
		// Cleanup: Wait for both nodes to become healthy before performing cleanup operations.
		// If nodes don't recover, the test fails (as it should for a recovery test).
		g.By("Cleanup: Waiting for both nodes to become Ready")
		o.Eventually(func() error {
			nodeList, err := utils.GetNodes(oc, utils.AllNodes)
			if err != nil {
				return fmt.Errorf("failed to retrieve nodes: %v", err)
			}

			if len(nodeList.Items) != 2 {
				return fmt.Errorf("expected 2 nodes, found %d", len(nodeList.Items))
			}

			// Verify both nodes are Ready
			for _, node := range nodeList.Items {
				nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
				if err != nil {
					return fmt.Errorf("failed to get node %s: %v", node.Name, err)
				}
				if !nodeutil.IsNodeReady(nodeObj) {
					return fmt.Errorf("node %s is not Ready", node.Name)
				}
			}

			framework.Logf("Both nodes are Ready")
			return nil
		}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.Succeed(), "Both nodes must be Ready before cleanup")

		// Both nodes are now healthy - perform cleanup operations
		nodeList, _ := utils.GetNodes(oc, utils.AllNodes)
		cleanupNode := nodeList.Items[1] // Use second node for cleanup commands

		g.By(fmt.Sprintf("Cleanup: Clearing any kubelet resource bans using node %s", cleanupNode.Name))
		cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone")
		if cleanupErr != nil {
			framework.Logf("Warning: Failed to clear kubelet-clone resource: %v (expected if no bans were active)", cleanupErr)
		} else {
			framework.Logf("Successfully cleared kubelet-clone resource bans and failures")
		}

		g.By("Cleanup: Validating etcd cluster health")
		o.Eventually(func() error {
			return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory)
		}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.Succeed(), "Etcd cluster must be healthy after cleanup")
	})

	g.It("should recover from single node kubelet service disruption", func() {
		nodeList, err := utils.GetNodes(oc, utils.AllNodes)
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
		o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

		nodes := nodeList.Items

		g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
		for _, node := range nodes {
			o.Expect(nodeutil.IsNodeReady(&node)).To(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
		}

		targetNode := nodes[0]
		survivingNode := nodes[1]
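
		// utils.AddConstraint and utils.RemoveConstraint are expected to wrap Pacemaker's
		// constraint commands, along the lines of `pcs resource ban kubelet-clone <node>` and
		// `pcs resource clear kubelet-clone`, executed from a node that remains healthy.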
g.By(fmt.Sprintf("Banning kubelet resource from node: %s", targetNode.Name))
112+
err = utils.AddConstraint(oc, survivingNode.Name, "kubelet-clone", targetNode.Name)
113+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to ban kubelet resource from node %s without errors", targetNode.Name))
114+
115+
// Register cleanup to ensure ban is removed even if test fails
116+
g.DeferCleanup(func() {
117+
framework.Logf("DeferCleanup: Ensuring kubelet-clone ban is removed")
118+
cleanupErr := utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
119+
if cleanupErr != nil {
120+
framework.Logf("DeferCleanup: Warning: Failed to clear kubelet-clone ban: %v (this is expected if already cleared)", cleanupErr)
121+
} else {
122+
framework.Logf("DeferCleanup: Successfully cleared kubelet-clone ban")
123+
}
124+
})
125+

		g.By(fmt.Sprintf("Checking that node %s is not in state Ready due to kubelet resource ban", targetNode.Name))
		o.Eventually(func() bool {
			nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
			if err != nil {
				framework.Logf("Error getting node %s: %v", targetNode.Name, err)
				return false
			}
			return !nodeutil.IsNodeReady(nodeObj)
		}, kubeletDisruptionTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should not be Ready after the kubelet resource ban is applied", targetNode.Name))

		g.By("Validating etcd cluster remains healthy with surviving node")
		o.Consistently(func() error {
			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
		}, 5*time.Minute, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))

		g.By("Clearing kubelet resource bans to allow normal operation")
		err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
		o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")

		g.By("Validating both nodes are Ready")
		for _, node := range nodes {
			o.Eventually(func() bool {
				nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
				if err != nil {
					return false
				}
				return nodeutil.IsNodeReady(nodeObj)
			}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready", node.Name))
		}

		g.By("Validating etcd cluster fully recovered")
		o.Eventually(func() error {
			return utils.LogEtcdClusterStatus(oc, "after resource ban removal", etcdClientFactory)
		}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be healthy")

		g.By("Validating essential operators available")
		o.Eventually(func() error {
			return utils.ValidateEssentialOperatorsAvailable(oc)
		}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
	})
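
	// This scenario stops kubelet directly on the node (out-of-band, rather than via a
	// resource ban) and verifies that Pacemaker's recurring monitor detects the failure
	// and restarts the service automatically.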
	g.It("should properly stop kubelet service and verify automatic restart on target node", func() {
		nodeList, err := utils.GetNodes(oc, utils.AllNodes)
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
		o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")

		nodes := nodeList.Items

		g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
		for _, node := range nodes {
			o.Eventually(func() bool {
				nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
				if err != nil {
					framework.Logf("Error getting node %s: %v", node.Name, err)
					return false
				}
				return nodeutil.IsNodeReady(nodeObj)
			}, nodeIsHealthyTimeout, pollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
		}

		targetNode := nodes[0]
		survivingNode := nodes[1]

		g.By(fmt.Sprintf("Verifying kubelet service is initially running on target node: %s", targetNode.Name))
		o.Eventually(func() bool {
			return utils.IsServiceRunning(oc, survivingNode.Name, targetNode.Name, "kubelet")
		}, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name))
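
		// StopKubeletService presumably stops kubelet out-of-band (e.g. `systemctl stop kubelet`
		// from a debug shell), simulating a service failure rather than a graceful,
		// Pacemaker-initiated stop, so the cluster's self-healing path does the recovery.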
g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name))
196+
err = utils.StopKubeletService(oc, targetNode.Name)
197+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name))
198+
199+
g.By("Waiting for Pacemaker to detect kubelet stopped")
200+
time.Sleep(pacemakerMonitorDetectPeriod)
201+
202+
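
		// Pacemaker runs a recurring monitor operation on each managed resource and logs the
		// result to the journal; the grep below matches the "Result of monitor operation for
		// kubelet ... not running" message that indicates the failure was detected.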
g.By("Verifying Pacemaker monitor detected kubelet as inactive")
203+
journalCmd := "sudo journalctl --no-pager --since '60 seconds ago' | grep 'Result of monitor operation for kubelet' | grep -i 'not running' || true"
204+
logs, logErr := exutil.DebugNodeRetryWithOptionsAndChroot(
205+
oc, targetNode.Name, "default", "bash", "-c", journalCmd)
206+
207+
o.Expect(logErr).To(o.BeNil(), "Should retrieve journal logs")
208+
o.Expect(logs).ToNot(o.BeEmpty(), "Pacemaker should have detected kubelet as 'not running (inactive)'")
209+
framework.Logf("Pacemaker monitor detection: %s", logs)
210+
211+
g.By("Verifying Pacemaker restarted kubelet-clone service")
212+
o.Eventually(func() bool {
213+
isRunning := utils.IsServiceRunning(oc, survivingNode.Name, targetNode.Name, "kubelet")
214+
framework.Logf("Kubelet running on %s: %v", targetNode.Name, isRunning)
215+
return isRunning
216+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet should be running on %s after Pacemaker restart", targetNode.Name))
217+
218+
g.By("Validating both nodes are Ready after Pacemaker restart")
219+
for _, node := range nodes {
220+
o.Eventually(func() bool {
221+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
222+
if err != nil {
223+
return false
224+
}
225+
return nodeutil.IsNodeReady(nodeObj)
226+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready", node.Name))
227+
}
228+
229+
g.By("Validating etcd cluster fully recovered")
230+
o.Eventually(func() error {
231+
return utils.LogEtcdClusterStatus(oc, "after kubelet restart", etcdClientFactory)
232+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be healthy")
233+
234+
g.By("Validating essential operators available")
235+
o.Eventually(func() error {
236+
return utils.ValidateEssentialOperatorsAvailable(oc)
237+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
238+
})
239+
240+
})
