@@ -41,9 +41,15 @@ const (
4141 CLIPrivilegeAdmin = true // Admin CLI with cluster-admin permissions
4242 KubeletPort = "10250" // Kubelet API port
4343
44- // Common timeouts used across TNF tests
44+ // Wait for cluster health after recovery/reboot (e.g. node replacement).
4545 clusterIsHealthyTimeout = 15 * time .Minute
46- debugContainerTimeout = 60 * time .Second
46+
47+ // Precondition timeouts for SkipIfClusterIsNotHealthy.
48+ preconditionClusterHealthyTimeout = 5 * time .Minute // nodes + cluster operators
49+ preconditionEtcdHealthyTimeout = 1 * time .Minute // etcd pods running, two voting members
50+
51+ // Max time for a single debug pod exec.
52+ debugContainerTimeout = 60 * time .Second
4753
4854 // Common poll intervals used across TNF tests
4955 FiveSecondPollInterval = 5 * time .Second // Default poll interval for most operations
@@ -147,19 +153,19 @@ func SkipIfClusterIsNotHealthy(oc *exutil.CLI, ecf *helpers.EtcdClientFactoryImp
147153 skipReasons = append (skipReasons , fmt .Sprintf ("expected 2 nodes for two-node cluster, found %d" , len (nodes .Items )))
148154 }
149155
150- if err := IsClusterHealthyWithTimeout (oc , clusterIsHealthyTimeout ); err != nil {
156+ if err := IsClusterHealthyWithTimeout (oc , preconditionClusterHealthyTimeout ); err != nil {
151157 skipReasons = append (skipReasons , fmt .Sprintf ("cluster-wide health failed: %v" , err ))
152158 }
153- if err := ensureEtcdPodsAreRunning (oc ); err != nil {
159+ if err := ensureEtcdPodsAreRunning (oc , preconditionEtcdHealthyTimeout ); err != nil {
154160 skipReasons = append (skipReasons , fmt .Sprintf ("etcd pods not running: %v" , err ))
155161 }
156162 // Only check etcd members if we successfully retrieved nodes
157163 if nodes != nil && len (nodes .Items ) == 2 {
158- if err := ensureEtcdHasTwoVotingMembers (nodes , ecf ); err != nil {
164+ if err := ensureEtcdHasTwoVotingMembers (nodes , ecf , preconditionEtcdHealthyTimeout ); err != nil {
159165 skipReasons = append (skipReasons , fmt .Sprintf ("etcd doesn't have two voting members: %v" , err ))
160166 }
161167 }
162- if err := ensureClusterOperatorHealthy (oc ); err != nil {
168+ if err := ensureClusterOperatorHealthy (oc , preconditionClusterHealthyTimeout ); err != nil {
163169 skipReasons = append (skipReasons , fmt .Sprintf ("cluster-etcd-operator not healthy: %v" , err ))
164170 }
165171
@@ -1138,10 +1144,10 @@ func GetMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started,
11381144 return started , learner , nil
11391145}
11401146
1141- // ensureClusterOperatorHealthy checks if the cluster-etcd-operator is healthy before running etcd tests
1142- func ensureClusterOperatorHealthy (oc * exutil.CLI ) error {
1143- framework .Logf ("Ensure cluster-etcd-operator is healthy (timeout: %v)" , clusterIsHealthyTimeout )
1144- ctx , cancel := context .WithTimeout (context .Background (), clusterIsHealthyTimeout )
1147+ // ensureClusterOperatorHealthy waits for the cluster-etcd-operator to become healthy. timeout is the maximum wait.
1148+ func ensureClusterOperatorHealthy (oc * exutil.CLI , timeout time. Duration ) error {
1149+ framework .Logf ("Ensure cluster-etcd-operator is healthy (timeout: %v)" , timeout )
1150+ ctx , cancel := context .WithTimeout (context .Background (), timeout )
11451151 defer cancel ()
11461152
11471153 for {
@@ -1176,9 +1182,10 @@ func ensureClusterOperatorHealthy(oc *exutil.CLI) error {
11761182 }
11771183}
11781184
1179- func ensureEtcdPodsAreRunning (oc * exutil.CLI ) error {
1180- framework .Logf ("Ensure Etcd pods are running (timeout: %v)" , clusterIsHealthyTimeout )
1181- ctx , cancel := context .WithTimeout (context .Background (), clusterIsHealthyTimeout )
1185+ // ensureEtcdPodsAreRunning waits for etcd pods to be running. timeout is the maximum wait.
1186+ func ensureEtcdPodsAreRunning (oc * exutil.CLI , timeout time.Duration ) error {
1187+ framework .Logf ("Ensure Etcd pods are running (timeout: %v)" , timeout )
1188+ ctx , cancel := context .WithTimeout (context .Background (), timeout )
11821189 defer cancel ()
11831190 for {
11841191 etcdPods , err := oc .AdminKubeClient ().CoreV1 ().Pods ("openshift-etcd" ).List (context .Background (), metav1.ListOptions {
@@ -1220,9 +1227,10 @@ func findClusterOperatorCondition(conditions []v1.ClusterOperatorStatusCondition
12201227 return nil
12211228}
12221229
1223- func ensureEtcdHasTwoVotingMembers (nodes * corev1.NodeList , ecf * helpers.EtcdClientFactoryImpl ) error {
1224- framework .Logf ("Ensure Etcd member list has two voting members (timeout: %v)" , clusterIsHealthyTimeout )
1225- ctx , cancel := context .WithTimeout (context .Background (), clusterIsHealthyTimeout )
1230+ // ensureEtcdHasTwoVotingMembers waits for etcd to have two voting members. timeout is the maximum wait.
1231+ func ensureEtcdHasTwoVotingMembers (nodes * corev1.NodeList , ecf * helpers.EtcdClientFactoryImpl , timeout time.Duration ) error {
1232+ framework .Logf ("Ensure Etcd member list has two voting members (timeout: %v)" , timeout )
1233+ ctx , cancel := context .WithTimeout (context .Background (), timeout )
12261234 defer cancel ()
12271235
12281236 for {
0 commit comments