Skip to content

Commit 83def2e

Browse files
Merge pull request #30290 from dhensel-rh/OCPEDGE-1484
OCPEDGE-1484: [TNF] kubelet disruption test
2 parents 9b50a28 + 4ecaf1f commit 83def2e

3 files changed

Lines changed: 857 additions & 6 deletions

File tree

pkg/testsuites/standard_suites.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,9 @@ var staticSuites = []ginkgo.TestSuite{
428428
Qualifiers: []string{
429429
`name.contains("[Suite:openshift/two-node") || name.contains("[OCPFeatureGate:DualReplica]") || name.contains("[OCPFeatureGate:HighlyAvailableArbiter]")`,
430430
},
431-
TestTimeout: 60 * time.Minute,
431+
TestTimeout: 60 * time.Minute,
432+
Parallelism: 1, // Tests must run serially as they involve node reboots and fencing
433+
ClusterStabilityDuringTest: ginkgo.Disruptive,
432434
},
433435
{
434436
Name: "openshift/auth/external-oidc",
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
package two_node
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"time"
7+
8+
g "github.com/onsi/ginkgo/v2"
9+
o "github.com/onsi/gomega"
10+
v1 "github.com/openshift/api/config/v1"
11+
"github.com/openshift/origin/test/extended/etcd/helpers"
12+
"github.com/openshift/origin/test/extended/two_node/utils"
13+
exutil "github.com/openshift/origin/test/extended/util"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
nodeutil "k8s.io/kubernetes/pkg/util/node"
16+
"k8s.io/kubernetes/test/e2e/framework"
17+
)
18+
19+
// Timing parameters for the kubelet disruption scenarios in this file.
const (
	kubeletDisruptionTimeout = 10 * time.Minute // Timeout for kubelet disruption scenarios (e.g. waiting for a banned node to go NotReady)
	kubeletRestoreTimeout    = 5 * time.Minute  // Time to wait for kubelet service restore and cluster recovery
	kubeletPollInterval      = 10 * time.Second // Poll interval for kubelet/node status checks
	kubeletGracePeriod       = 30 * time.Second // Grace period for kubelet to start/stop
)
25+
26+
var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:DualReplica][Suite:openshift/two-node][Serial][Slow][Disruptive] Two Node with Fencing cluster", func() {
27+
defer g.GinkgoRecover()
28+
29+
var (
30+
oc = exutil.NewCLIWithoutNamespace("two-node-kubelet").AsAdmin()
31+
etcdClientFactory *helpers.EtcdClientFactoryImpl
32+
)
33+
34+
g.BeforeEach(func() {
35+
utils.SkipIfNotTopology(oc, v1.DualReplicaTopologyMode)
36+
37+
nodes, err := utils.GetNodes(oc, utils.AllNodes)
38+
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
39+
o.Expect(len(nodes.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")
40+
41+
etcdClientFactory = helpers.NewEtcdClientFactory(oc.KubeClient())
42+
43+
utils.SkipIfClusterIsNotHealthy(oc, etcdClientFactory, nodes)
44+
})
45+
46+
g.AfterEach(func() {
47+
// Cleanup: Wait for both nodes to become healthy before performing cleanup operations.
48+
// If nodes don't recover, the test fails (as it should for a recovery test).
49+
g.By("Cleanup: Waiting for both nodes to become Ready")
50+
o.Eventually(func() error {
51+
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
52+
if err != nil {
53+
return fmt.Errorf("failed to retrieve nodes: %v", err)
54+
}
55+
56+
if len(nodeList.Items) != 2 {
57+
return fmt.Errorf("expected 2 nodes, found %d", len(nodeList.Items))
58+
}
59+
60+
// Verify both nodes are Ready
61+
for _, node := range nodeList.Items {
62+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
63+
if err != nil {
64+
return fmt.Errorf("failed to get node %s: %v", node.Name, err)
65+
}
66+
if !nodeutil.IsNodeReady(nodeObj) {
67+
return fmt.Errorf("node %s is not Ready", node.Name)
68+
}
69+
}
70+
71+
framework.Logf("Both nodes are Ready")
72+
return nil
73+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.Succeed(), "Both nodes must be Ready before cleanup")
74+
75+
// Both nodes are now healthy - perform cleanup operations
76+
nodeList, _ := utils.GetNodes(oc, utils.AllNodes)
77+
cleanupNode := nodeList.Items[1] // Use second node for cleanup commands
78+
79+
g.By(fmt.Sprintf("Cleanup: Clearing any kubelet resource bans using node %s", cleanupNode.Name))
80+
cleanupErr := utils.RemoveConstraint(oc, cleanupNode.Name, "kubelet-clone")
81+
if cleanupErr != nil {
82+
framework.Logf("Warning: Failed to clear kubelet-clone resource: %v (expected if no bans were active)", cleanupErr)
83+
} else {
84+
framework.Logf("Successfully cleared kubelet-clone resource bans and failures")
85+
}
86+
87+
g.By("Cleanup: Validating etcd cluster health")
88+
o.Eventually(func() error {
89+
return utils.LogEtcdClusterStatus(oc, "AfterEach cleanup", etcdClientFactory)
90+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.Succeed(), "Etcd cluster must be healthy after cleanup")
91+
})
92+
93+
g.It("should recover from single node kubelet service disruption", func() {
94+
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
95+
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
96+
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")
97+
98+
nodes := nodeList.Items
99+
100+
g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
101+
for _, node := range nodes {
102+
if ready := nodeutil.IsNodeReady(&node); !ready {
103+
o.Expect(ready).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
104+
}
105+
}
106+
107+
targetNode := nodes[0]
108+
survivingNode := nodes[1]
109+
110+
g.By(fmt.Sprintf("Banning kubelet resource from node: %s", targetNode.Name))
111+
err = utils.AddConstraint(oc, survivingNode.Name, "kubelet-clone", targetNode.Name)
112+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to ban kubelet resource from node %s without errors", targetNode.Name))
113+
114+
// Register cleanup to ensure ban is removed even if test fails
115+
g.DeferCleanup(func() {
116+
framework.Logf("DeferCleanup: Ensuring kubelet-clone ban is removed")
117+
cleanupErr := utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
118+
if cleanupErr != nil {
119+
framework.Logf("DeferCleanup: Warning: Failed to clear kubelet-clone ban: %v (this is expected if already cleared)", cleanupErr)
120+
} else {
121+
framework.Logf("DeferCleanup: Successfully cleared kubelet-clone ban")
122+
}
123+
})
124+
125+
g.By(fmt.Sprintf("Checking that node %s is not in state Ready due to kubelet resource ban", targetNode.Name))
126+
o.Eventually(func() bool {
127+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), targetNode.Name, metav1.GetOptions{})
128+
if err != nil {
129+
framework.Logf("Error getting node %s: %v", targetNode.Name, err)
130+
return false
131+
}
132+
return !nodeutil.IsNodeReady(nodeObj)
133+
}, kubeletDisruptionTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s is not in state Ready after kubelet resource ban is applied", targetNode.Name))
134+
135+
g.By("Validating etcd cluster remains healthy with surviving node")
136+
o.Consistently(func() error {
137+
return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, survivingNode.Name)
138+
}, 5*time.Minute, pollInterval).ShouldNot(o.HaveOccurred(), fmt.Sprintf("etcd member %s should remain healthy during kubelet disruption", survivingNode.Name))
139+
140+
g.By("Clearing kubelet resource bans to allow normal operation")
141+
err = utils.RemoveConstraint(oc, survivingNode.Name, "kubelet-clone")
142+
o.Expect(err).To(o.BeNil(), "Expected to clear kubelet resource bans without errors")
143+
144+
g.By("Validating both nodes are Ready")
145+
for _, node := range nodes {
146+
o.Eventually(func() bool {
147+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
148+
if err != nil {
149+
return false
150+
}
151+
return nodeutil.IsNodeReady(nodeObj)
152+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready", node.Name))
153+
}
154+
155+
g.By("Validating etcd cluster fully recovered")
156+
o.Eventually(func() error {
157+
return utils.LogEtcdClusterStatus(oc, "after resource ban removal", etcdClientFactory)
158+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be healthy")
159+
160+
g.By("Validating essential operators available")
161+
o.Eventually(func() error {
162+
return utils.ValidateEssentialOperatorsAvailable(oc)
163+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
164+
})
165+
166+
g.It("should properly stop kubelet service and verify automatic restart on target node", func() {
167+
nodeList, err := utils.GetNodes(oc, utils.AllNodes)
168+
o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
169+
o.Expect(len(nodeList.Items)).To(o.Equal(2), "Expected to find exactly 2 nodes for two-node cluster")
170+
171+
nodes := nodeList.Items
172+
173+
g.By("Ensuring both nodes are healthy before starting kubelet disruption test")
174+
for _, node := range nodes {
175+
o.Eventually(func() bool {
176+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
177+
if err != nil {
178+
framework.Logf("Error getting node %s: %v", node.Name, err)
179+
return false
180+
}
181+
return nodeutil.IsNodeReady(nodeObj)
182+
}, nodeIsHealthyTimeout, pollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be ready before kubelet disruption", node.Name))
183+
}
184+
185+
targetNode := nodes[0]
186+
survivingNode := nodes[1]
187+
188+
g.By(fmt.Sprintf("Verifying kubelet service is initially running on target node: %s", targetNode.Name))
189+
o.Eventually(func() bool {
190+
isRunning := utils.IsServiceRunning(oc, survivingNode.Name, targetNode.Name, "kubelet")
191+
return isRunning
192+
}, kubeletGracePeriod, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet service should be running initially on node %s", targetNode.Name))
193+
194+
// Record the time before stopping kubelet to filter failures
195+
stopTime := time.Now()
196+
197+
g.By(fmt.Sprintf("Stopping kubelet service on target node: %s", targetNode.Name))
198+
err = utils.StopKubeletService(oc, targetNode.Name)
199+
o.Expect(err).To(o.BeNil(), fmt.Sprintf("Expected to stop kubelet service on node %s without errors", targetNode.Name))
200+
201+
g.By("Waiting for Pacemaker to auto-recover and restart kubelet-clone service")
202+
o.Eventually(func() bool {
203+
isRunning := utils.IsServiceRunning(oc, survivingNode.Name, targetNode.Name, "kubelet")
204+
framework.Logf("Kubelet running on %s: %v", targetNode.Name, isRunning)
205+
return isRunning
206+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Kubelet should be running on %s after Pacemaker restart", targetNode.Name))
207+
208+
g.By("Verifying Pacemaker recorded the kubelet failure in operation history")
209+
// Use a time window from when we stopped kubelet to now
210+
failureWindow := time.Since(stopTime) + time.Minute // Add buffer for clock skew
211+
hasFailure, failures, err := utils.HasRecentResourceFailure(oc, survivingNode.Name, "kubelet-clone", failureWindow)
212+
o.Expect(err).To(o.BeNil(), "Expected to check resource failure history without errors")
213+
o.Expect(hasFailure).To(o.BeTrue(), "Pacemaker should have recorded kubelet failure in operation history")
214+
framework.Logf("Pacemaker recorded %d failure(s) for kubelet-clone: %+v", len(failures), failures)
215+
216+
g.By("Validating both nodes are Ready after Pacemaker restart")
217+
for _, node := range nodes {
218+
o.Eventually(func() bool {
219+
nodeObj, err := oc.AdminKubeClient().CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{})
220+
if err != nil {
221+
return false
222+
}
223+
return nodeutil.IsNodeReady(nodeObj)
224+
}, kubeletRestoreTimeout, kubeletPollInterval).Should(o.BeTrue(), fmt.Sprintf("Node %s should be Ready", node.Name))
225+
}
226+
227+
g.By("Validating etcd cluster fully recovered")
228+
o.Eventually(func() error {
229+
return utils.LogEtcdClusterStatus(oc, "after kubelet restart", etcdClientFactory)
230+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster should be healthy")
231+
232+
g.By("Validating essential operators available")
233+
o.Eventually(func() error {
234+
return utils.ValidateEssentialOperatorsAvailable(oc)
235+
}, kubeletRestoreTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "Essential operators should be available")
236+
})
237+
238+
})

0 commit comments

Comments
 (0)