Skip to content

Commit 2a1c975

Browse files
Merge pull request #30812 from jaypoulz/tnf-skip-aggresively
NO-JIRA: refactor(tnf): separate precondition timeouts from recovery cluster health
2 parents 7563f3d + 6013d7f commit 2a1c975

1 file changed

Lines changed: 24 additions & 16 deletions

File tree

test/extended/two_node/utils/common.go

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,15 @@ const (
4141
CLIPrivilegeAdmin = true // Admin CLI with cluster-admin permissions
4242
KubeletPort = "10250" // Kubelet API port
4343

44-
// Common timeouts used across TNF tests
44+
// Wait for cluster health after recovery/reboot (e.g. node replacement).
4545
clusterIsHealthyTimeout = 15 * time.Minute
46-
debugContainerTimeout = 60 * time.Second
46+
47+
// Precondition timeouts for SkipIfClusterIsNotHealthy.
48+
preconditionClusterHealthyTimeout = 5 * time.Minute // nodes + cluster operators
49+
preconditionEtcdHealthyTimeout = 1 * time.Minute // etcd pods running, two voting members
50+
51+
// Max time for a single debug pod exec.
52+
debugContainerTimeout = 60 * time.Second
4753

4854
// Common poll intervals used across TNF tests
4955
FiveSecondPollInterval = 5 * time.Second // Default poll interval for most operations
@@ -147,19 +153,19 @@ func SkipIfClusterIsNotHealthy(oc *exutil.CLI, ecf *helpers.EtcdClientFactoryImp
147153
skipReasons = append(skipReasons, fmt.Sprintf("expected 2 nodes for two-node cluster, found %d", len(nodes.Items)))
148154
}
149155

150-
if err := IsClusterHealthyWithTimeout(oc, clusterIsHealthyTimeout); err != nil {
156+
if err := IsClusterHealthyWithTimeout(oc, preconditionClusterHealthyTimeout); err != nil {
151157
skipReasons = append(skipReasons, fmt.Sprintf("cluster-wide health failed: %v", err))
152158
}
153-
if err := ensureEtcdPodsAreRunning(oc); err != nil {
159+
if err := ensureEtcdPodsAreRunning(oc, preconditionEtcdHealthyTimeout); err != nil {
154160
skipReasons = append(skipReasons, fmt.Sprintf("etcd pods not running: %v", err))
155161
}
156162
// Only check etcd members if we successfully retrieved nodes
157163
if nodes != nil && len(nodes.Items) == 2 {
158-
if err := ensureEtcdHasTwoVotingMembers(nodes, ecf); err != nil {
164+
if err := ensureEtcdHasTwoVotingMembers(nodes, ecf, preconditionEtcdHealthyTimeout); err != nil {
159165
skipReasons = append(skipReasons, fmt.Sprintf("etcd doesn't have two voting members: %v", err))
160166
}
161167
}
162-
if err := ensureClusterOperatorHealthy(oc); err != nil {
168+
if err := ensureClusterOperatorHealthy(oc, preconditionClusterHealthyTimeout); err != nil {
163169
skipReasons = append(skipReasons, fmt.Sprintf("cluster-etcd-operator not healthy: %v", err))
164170
}
165171

@@ -1138,10 +1144,10 @@ func GetMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started,
11381144
return started, learner, nil
11391145
}
11401146

1141-
// ensureClusterOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests
1142-
func ensureClusterOperatorHealthy(oc *exutil.CLI) error {
1143-
framework.Logf("Ensure cluster-etcd-operator is healthy (timeout: %v)", clusterIsHealthyTimeout)
1144-
ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout)
1147+
// ensureClusterOperatorHealthy checks if the cluster-etcd-operator is healthy. timeout is the maximum wait.
1148+
func ensureClusterOperatorHealthy(oc *exutil.CLI, timeout time.Duration) error {
1149+
framework.Logf("Ensure cluster-etcd-operator is healthy (timeout: %v)", timeout)
1150+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
11451151
defer cancel()
11461152

11471153
for {
@@ -1176,9 +1182,10 @@ func ensureClusterOperatorHealthy(oc *exutil.CLI) error {
11761182
}
11771183
}
11781184

1179-
func ensureEtcdPodsAreRunning(oc *exutil.CLI) error {
1180-
framework.Logf("Ensure Etcd pods are running (timeout: %v)", clusterIsHealthyTimeout)
1181-
ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout)
1185+
// ensureEtcdPodsAreRunning waits for etcd pods to be running. timeout is the maximum wait.
1186+
func ensureEtcdPodsAreRunning(oc *exutil.CLI, timeout time.Duration) error {
1187+
framework.Logf("Ensure Etcd pods are running (timeout: %v)", timeout)
1188+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
11821189
defer cancel()
11831190
for {
11841191
etcdPods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{
@@ -1220,9 +1227,10 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
12201227
return nil
12211228
}
12221229

1223-
func ensureEtcdHasTwoVotingMembers(nodes *corev1.NodeList, ecf *helpers.EtcdClientFactoryImpl) error {
1224-
framework.Logf("Ensure Etcd member list has two voting members (timeout: %v)", clusterIsHealthyTimeout)
1225-
ctx, cancel := context.WithTimeout(context.Background(), clusterIsHealthyTimeout)
1230+
// ensureEtcdHasTwoVotingMembers waits for etcd to have two voting members. timeout is the maximum wait.
1231+
func ensureEtcdHasTwoVotingMembers(nodes *corev1.NodeList, ecf *helpers.EtcdClientFactoryImpl, timeout time.Duration) error {
1232+
framework.Logf("Ensure Etcd member list has two voting members (timeout: %v)", timeout)
1233+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
12261234
defer cancel()
12271235

12281236
for {

0 commit comments

Comments (0)