Skip to content

Commit d7ad0db

Browse files
Merge pull request #30922 from kasturinarra/automate_case_86079
OCPEDGE-2381: Add test to verify that a backup container exists when etcd crashes
2 parents 53cc9e6 + 29d00f7 commit d7ad0db

1 file changed

Lines changed: 61 additions & 0 deletions

File tree

test/extended/two_node/tnf_recovery.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"math/rand"
77
"os"
8+
"strings"
89
"time"
910

1011
g "github.com/onsi/ginkgo/v2"
@@ -413,6 +414,66 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
413414
&targetNode, true, false, // targetNode expected started == true, learner == false
414415
6*time.Minute, 45*time.Second)
415416
})
417+
418+
// Removes /var/lib/etcd/pod.yaml on targetNode, reboots it ungracefully, waits
// for it to be re-added as a learner and promoted back to a voting member, and
// then checks three things on targetNode: the etcd container is running, an
// etcd-previous container exists, and the pacemaker journal (since the
// timestamp taken before the reboot) reports that a new working copy of
// pod.yaml was created.
g.It("should leave a backup container behind for debugging when etcd container crashes", func() {
	survivedNode := peerNode

	g.By("Recording epoch timestamp before reboot")
	epochStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", "date +%s")
	o.Expect(err).To(o.BeNil())
	rebootEpoch := strings.TrimSpace(epochStr)
	// rebootEpoch is interpolated into the journalctl command at the end of
	// the test. Fail here, before the disruptive steps, if the debug output is
	// not a bare integer; otherwise the problem would only surface minutes
	// later as a misleading "pacemaker log" failure.
	o.Expect(rebootEpoch).To(o.MatchRegexp(`^[0-9]+$`),
		fmt.Sprintf("expected a numeric epoch timestamp from %s, got %q", targetNode.Name, rebootEpoch))

	g.By(fmt.Sprintf("Cleaning up any stale etcd-previous container on %s", targetNode.Name))
	// Best-effort cleanup: the result is deliberately ignored and the shell
	// command itself ends in `|| true`, so a missing container is not an error.
	// NOTE(review): presumably this ensures the existence check below cannot be
	// satisfied by a container left over from an earlier run - confirm.
	_, _ = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", "podman rm -f etcd-previous 2>/dev/null || true")

	g.By(fmt.Sprintf("Removing /var/lib/etcd/pod.yaml on %s", targetNode.Name))
	_, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", "rm -f /var/lib/etcd/pod.yaml")
	o.Expect(err).To(o.BeNil(), "Expected to remove pod.yaml without error")

	g.By(fmt.Sprintf("Rebooting %s ungracefully", targetNode.Name))
	err = exutil.TriggerNodeRebootUngraceful(oc.KubeClient(), targetNode.Name)
	o.Expect(err).To(o.BeNil(), "Expected to trigger ungraceful reboot without error")
	// Fixed one-minute pause before polling etcd membership.
	// NOTE(review): presumably gives the node time to actually go down - confirm.
	time.Sleep(time.Minute)

	g.By(fmt.Sprintf("Ensuring that %s added %s back as learner (timeout: %v)", survivedNode.Name, targetNode.Name, memberIsLeaderTimeout))
	validateEtcdRecoveryState(oc, etcdClientFactory,
		&survivedNode,
		&targetNode, false, true, // targetNode expected started == false, learner == true
		memberIsLeaderTimeout, utils.FiveSecondPollInterval)

	g.By(fmt.Sprintf("Ensuring %s rejoins as learner (timeout: %v)", targetNode.Name, memberRejoinedLearnerTimeout))
	validateEtcdRecoveryState(oc, etcdClientFactory,
		&survivedNode,
		&targetNode, true, true, // targetNode expected started == true, learner == true
		memberRejoinedLearnerTimeout, utils.FiveSecondPollInterval)

	g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member (timeout: %v)", targetNode.Name, memberPromotedVotingTimeout))
	validateEtcdRecoveryState(oc, etcdClientFactory,
		&survivedNode,
		&targetNode, true, false, // targetNode expected started == true, learner == false
		memberPromotedVotingTimeout, utils.FiveSecondPollInterval)

	g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name))
	// The command string is split on single spaces into the argv passed to the
	// debug helper; the expected output is the literal text 'true' including
	// the single quotes.
	got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...)
	o.Expect(err).To(o.BeNil())
	o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name))

	g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name))
	// `podman ps -a` lists stopped containers too; grep exits non-zero when
	// nothing matches, which surfaces here as err.
	prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous")
	o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name))
	o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"),
		fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name))

	g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name))
	// Only journal entries at or after the pre-reboot timestamp are searched;
	// a non-matching grep makes the command fail, so only err is inspected.
	_, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
		"bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since=@%s --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", rebootEpoch))
	o.Expect(err).To(o.BeNil(), "Expected pacemaker log to contain pod.yaml recreation entry after reboot")
})
416477
})
417478

418479
func validateEtcdRecoveryState(

0 commit comments

Comments
 (0)