|
5 | 5 | "fmt" |
6 | 6 | "math/rand" |
7 | 7 | "os" |
| 8 | + "strings" |
8 | 9 | "time" |
9 | 10 |
|
10 | 11 | g "github.com/onsi/ginkgo/v2" |
@@ -413,6 +414,66 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual |
413 | 414 | &targetNode, true, false, // targetNode expected started == true, learner == false |
414 | 415 | 6*time.Minute, 45*time.Second) |
415 | 416 | }) |
| 417 | + |
| 418 | + g.It("should leave a backup container behind for debugging when etcd container crashes", func() { |
| 419 | + survivedNode := peerNode |
| 420 | + |
| 421 | + g.By("Recording epoch timestamp before reboot") |
| 422 | + epochStr, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 423 | + "bash", "-c", "date +%s") |
| 424 | + o.Expect(err).To(o.BeNil()) |
| 425 | + rebootEpoch := strings.TrimSpace(epochStr) |
| 426 | + |
| 427 | + g.By(fmt.Sprintf("Cleaning up any stale etcd-previous container on %s", targetNode.Name)) |
| 428 | + _, _ = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 429 | + "bash", "-c", "podman rm -f etcd-previous 2>/dev/null || true") |
| 430 | + |
| 431 | + g.By(fmt.Sprintf("Removing /var/lib/etcd/pod.yaml on %s", targetNode.Name)) |
| 432 | + _, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 433 | + "bash", "-c", "rm -f /var/lib/etcd/pod.yaml") |
| 434 | + o.Expect(err).To(o.BeNil(), "Expected to remove pod.yaml without error") |
| 435 | + |
| 436 | + g.By(fmt.Sprintf("Rebooting %s ungracefully", targetNode.Name)) |
| 437 | + err = exutil.TriggerNodeRebootUngraceful(oc.KubeClient(), targetNode.Name) |
| 438 | + o.Expect(err).To(o.BeNil(), "Expected to trigger ungraceful reboot without error") |
| 439 | + time.Sleep(time.Minute) |
| 440 | + |
| 441 | + g.By(fmt.Sprintf("Ensuring that %s added %s back as learner (timeout: %v)", survivedNode.Name, targetNode.Name, memberIsLeaderTimeout)) |
| 442 | + validateEtcdRecoveryState(oc, etcdClientFactory, |
| 443 | + &survivedNode, |
| 444 | + &targetNode, false, true, |
| 445 | + memberIsLeaderTimeout, utils.FiveSecondPollInterval) |
| 446 | + |
| 447 | + g.By(fmt.Sprintf("Ensuring %s rejoins as learner (timeout: %v)", targetNode.Name, memberRejoinedLearnerTimeout)) |
| 448 | + validateEtcdRecoveryState(oc, etcdClientFactory, |
| 449 | + &survivedNode, |
| 450 | + &targetNode, true, true, |
| 451 | + memberRejoinedLearnerTimeout, utils.FiveSecondPollInterval) |
| 452 | + |
| 453 | + g.By(fmt.Sprintf("Ensuring %s node is promoted back as voting member (timeout: %v)", targetNode.Name, memberPromotedVotingTimeout)) |
| 454 | + validateEtcdRecoveryState(oc, etcdClientFactory, |
| 455 | + &survivedNode, |
| 456 | + &targetNode, true, false, |
| 457 | + memberPromotedVotingTimeout, utils.FiveSecondPollInterval) |
| 458 | + |
| 459 | + g.By(fmt.Sprintf("Verifying etcd container is running on %s", targetNode.Name)) |
| 460 | + got, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 461 | + strings.Split(ensurePodmanEtcdContainerIsRunning, " ")...) |
| 462 | + o.Expect(err).To(o.BeNil()) |
| 463 | + o.Expect(got).To(o.Equal("'true'"), fmt.Sprintf("expected etcd container running on %s", targetNode.Name)) |
| 464 | + |
| 465 | + g.By(fmt.Sprintf("Verifying etcd-previous container exists on %s", targetNode.Name)) |
| 466 | + prevOutput, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 467 | + "bash", "-c", "podman ps -a --format '{{.Names}}' | grep -m1 etcd-previous") |
| 468 | + o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected etcd-previous container to exist on %s", targetNode.Name)) |
| 469 | + o.Expect(strings.TrimSpace(prevOutput)).To(o.Equal("etcd-previous"), |
| 470 | + fmt.Sprintf("expected etcd-previous container on %s", targetNode.Name)) |
| 471 | + |
| 472 | + g.By(fmt.Sprintf("Verifying pod.yaml was recreated on %s via pacemaker log", targetNode.Name)) |
| 473 | + _, err = exutil.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd", |
| 474 | + "bash", "-c", fmt.Sprintf("journalctl -u pacemaker --since=@%s --no-pager | grep -m1 -i 'a new working copy of /etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml was created'", rebootEpoch)) |
| 475 | + o.Expect(err).To(o.BeNil(), "Expected pacemaker log to contain pod.yaml recreation entry after reboot") |
| 476 | + }) |
416 | 477 | }) |
417 | 478 |
|
418 | 479 | func validateEtcdRecoveryState( |
|
0 commit comments