From 8989bc9c85b6c9b00d5f4cce92c35a6bc2fbc380 Mon Sep 17 00:00:00 2001 From: Rafael Garcia Date: Tue, 9 Jun 2026 21:01:35 -0400 Subject: [PATCH] Harden test network cleanup and sweep orphaned iptables rules The Linux integration tests create per-bridge iptables FORWARD/NAT rules. On the shared self-hosted runner these leak: the test-harness cleanup called iptables without the -w lock-wait flag and ignored all errors, so under xtables-lock contention from concurrent CI jobs cleanup failed silently. Rules whose bridge interface no longer exists then accumulated indefinitely (cleanupStaleLinkDownRoutes only handles still-present linkdown routes), inflating every iptables operation over time and contributing to flaky "instance did not reach Running within 45s" timeouts. - Add -w 5 to all harness iptables calls (matches lib/network convention) - Surface cleanup errors to stderr instead of swallowing them; retry deletes on transient lock contention; treat already-gone routes/links as benign - Sweep orphaned test iptables rules (interface gone) once per test binary under the existing subnet lock, scoped to the "hm" test prefix so a non-test hypeman process's "ha" rules are never touched - Remove dead HYPEMAN_TEST_NETWORK_TMPDIR plumbing from test.yml (its Go reader was reverted in #268; re-wiring it would reintroduce per-run lock/lease isolation and break cross-run subnet coordination) Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/test.yml | 9 -- lib/instances/test_network_config_test.go | 143 +++++++++++++++++++++- 2 files changed, 138 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e333de51..0497782e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -74,13 +74,6 @@ jobs: sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'" done - - name: Create run-scoped temp directory - run: | - TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" - sudo rm -rf "$TEST_NETWORK_TMPDIR" - mkdir -p "$TEST_NETWORK_TMPDIR" - sudo chown -R "$(id -u):$(id -g)" "$TEST_NETWORK_TMPDIR" - # Avoids rate limits when running the tests # Tests includes pulling, then converting to disk images - name: Login to Docker Hub @@ -141,7 +134,6 @@ jobs: - name: Run tests env: - HYPEMAN_TEST_NETWORK_TMPDIR: /tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }} GO_TEST_TIMEOUT: 600s # Docker auth for tests running as root (sudo) DOCKER_CONFIG: /home/debianuser/.docker @@ -171,7 +163,6 @@ jobs: echo "$units" | xargs -r sudo systemctl reset-failed || true fi sudo rm -f /run/hypeman/uffd/ci-${{ github.run_id }}-${{ github.run_attempt }}-*.env - sudo rm -rf "/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" rm -f "${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }}" test-darwin: diff --git a/lib/instances/test_network_config_test.go b/lib/instances/test_network_config_test.go index c26efcdb..0df71cb8 100644 --- a/lib/instances/test_network_config_test.go +++ b/lib/instances/test_network_config_test.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "regexp" "runtime" "strconv" "strings" @@ -32,6 +33,34 @@ const ( testSubnetThirdOctetMax = 250 ) +// iptablesWaitSeconds matches the production convention in +// lib/network/bridge_linux.go so the test harness cooperates with the +// system-wide xtables lock instead of failing immediately under contention. +const testIPTablesWaitSeconds = "5" + +// iptablesDeleteAttempts is the number of times we retry an iptables delete on +// transient lock contention before giving up. +const iptablesDeleteAttempts = 3 + +// newTestIPTablesCommand builds an iptables command with the -w wait flag so it +// waits for the xtables lock rather than failing immediately. +func newTestIPTablesCommand(args ...string) *exec.Cmd { + fullArgs := make([]string, 0, len(args)+2) + fullArgs = append(fullArgs, "-w", testIPTablesWaitSeconds) + fullArgs = append(fullArgs, args...) + return exec.Command("iptables", fullArgs...) +} + +// logTestNetworkErr surfaces a real cleanup failure to stderr. These helpers run +// both with and without a *testing.T (e.g. under testNetworkGuardCleanupOnce and +// in t.Cleanup), so we log instead of swallowing the error silently. +func logTestNetworkErr(op string, err error) { + if err == nil { + return + } + fmt.Fprintf(os.Stderr, "hypeman-test-network: %s: %v\n", op, err) +} + type testNetworkLease struct { cfg config.NetworkConfig release func() @@ -105,6 +134,10 @@ func allocateTestNetworkLease(testName string, seq uint32) (*testNetworkLease, e testNetworkGuardCleanupOnce.Do(func() { cleanupStaleLinkDownRoutes(routes) + // Sweep iptables rules for test bridges that no longer exist. Once a + // bridge is fully deleted its route is gone too, so linkdown cleanup + // above can't catch these — they would otherwise leak forever. + sweepOrphanedTestIPTablesRules() // Refresh route snapshot after cleanup so subnet selection sees current state. refreshed, refreshErr := listHostRoutes() if refreshErr == nil { @@ -324,6 +357,74 @@ func cleanupStaleLinkDownRoutes(routes []hostRoute) { } } +// testRuleCommentPattern matches hypeman test-harness iptables rule comments and +// captures both the full comment and the referenced bridge name. We deliberately +// anchor on the "hm" test prefix so we never touch "ha"-prefixed rules from a +// real (non-test) hypeman process running on the same host. +var testRuleCommentPattern = regexp.MustCompile(`hypeman-(?:fwd-out|fwd-in|nat)-(hm[0-9a-f]+)`) + +// sweepOrphanedTestIPTablesRules removes hypeman test iptables rules whose +// referenced bridge interface no longer exists. Once a bridge is fully deleted, +// its route disappears, so cleanupStaleLinkDownRoutes can't reach these rules and +// they accumulate indefinitely on the shared CI runner, slowing every iptables +// operation under the xtables lock. +// +// This is conservative and safe: a rule whose bridge interface is gone can never +// affect live traffic, and we never delete a rule whose interface still exists. +func sweepOrphanedTestIPTablesRules() { + sweepOrphanedTestRulesInChain("", "FORWARD") + sweepOrphanedTestRulesInChain("nat", "POSTROUTING") +} + +func sweepOrphanedTestRulesInChain(table, chain string) { + args := []string{} + if table != "" { + args = append(args, "-t", table) + } + args = append(args, "-S", chain) + output, err := newTestIPTablesCommand(args...).Output() + if err != nil { + logTestNetworkErr(fmt.Sprintf("iptables -S %s/%s for orphan sweep", table, chain), err) + return + } + + // Collect comments whose bridge interface is gone. Cache bridge existence and + // dedupe comments so we shell out and delete each orphan only once. + exists := make(map[string]bool) + seen := make(map[string]struct{}) + var orphanedComments []string + for _, line := range strings.Split(string(output), "\n") { + match := testRuleCommentPattern.FindStringSubmatch(line) + if match == nil { + continue + } + comment := match[0] + bridge := match[1] + + alive, checked := exists[bridge] + if !checked { + alive = bridgeExists(bridge) + exists[bridge] = alive + } + if alive { + // Never delete a rule whose bridge interface still exists. + continue + } + + if _, ok := seen[comment]; ok { + continue + } + seen[comment] = struct{}{} + orphanedComments = append(orphanedComments, comment) + } + + // Delete by comment, reusing the existing line-number-based deleter which + // handles quoting and renumbering correctly. + for _, comment := range orphanedComments { + deleteIPTablesRulesByComment(table, chain, comment) + } +} + func pruneStaleLeases(leases map[string]subnetLease, routes []hostRoute) { liveRoutes := make(map[string]struct{}, len(routes)) for _, route := range routes { @@ -422,10 +523,18 @@ func isTestCIDR(cidr string) bool { func cleanupTestNetworkArtifacts(bridgeName, subnetCIDR string) { if subnetCIDR != "" && bridgeName != "" { - _ = exec.Command("ip", "-4", "route", "del", subnetCIDR, "dev", bridgeName).Run() + // The route may already be gone (linkdown cleanup, prior pass) — that is + // not a real failure, so only surface unexpected errors. + out, err := exec.Command("ip", "-4", "route", "del", subnetCIDR, "dev", bridgeName).CombinedOutput() + if err != nil && !isNoSuchObjectErr(out) { + logTestNetworkErr(fmt.Sprintf("ip route del %s dev %s: %s", subnetCIDR, bridgeName, strings.TrimSpace(string(out))), err) + } } if bridgeName != "" { - _ = exec.Command("ip", "link", "delete", bridgeName, "type", "bridge").Run() + out, err := exec.Command("ip", "link", "delete", bridgeName, "type", "bridge").CombinedOutput() + if err != nil && !isNoSuchObjectErr(out) { + logTestNetworkErr(fmt.Sprintf("ip link delete %s: %s", bridgeName, strings.TrimSpace(string(out))), err) + } } bridgeSuffix := strings.ToLower(bridgeName) @@ -434,6 +543,14 @@ func cleanupTestNetworkArtifacts(bridgeName, subnetCIDR string) { deleteIPTablesRulesByComment("", "FORWARD", "hypeman-fwd-in-"+bridgeSuffix) } +// isNoSuchObjectErr reports whether an `ip` command output indicates the object +// (route/link) was already gone, which is an expected, benign outcome for +// idempotent cleanup. +func isNoSuchObjectErr(combinedOutput []byte) bool { + out := strings.ToLower(string(combinedOutput)) + return strings.Contains(out, "cannot find") || strings.Contains(out, "no such") +} + func deleteIPTablesRulesByComment(table, chain, comment string) { if comment == "" { return @@ -444,9 +561,9 @@ func deleteIPTablesRulesByComment(table, chain, comment string) { args = append(args, "-t", table) } args = append(args, "-L", chain, "--line-numbers", "-n") - listCmd := exec.Command("iptables", args...) - output, err := listCmd.Output() + output, err := newTestIPTablesCommand(args...).Output() if err != nil { + logTestNetworkErr(fmt.Sprintf("iptables list %s/%s for comment %q", table, chain, comment), err) return } @@ -472,8 +589,24 @@ func deleteIPTablesRulesByComment(table, chain, comment string) { delArgs = append(delArgs, "-t", table) } delArgs = append(delArgs, "-D", chain, strconv.Itoa(ruleNums[i])) - _ = exec.Command("iptables", delArgs...).Run() + deleteIPTablesRuleWithRetry(delArgs) + } +} + +// deleteIPTablesRuleWithRetry runs an iptables `-D` delete, retrying a few times +// on transient lock contention (the failure mode under concurrent CI jobs). +func deleteIPTablesRuleWithRetry(delArgs []string) { + var err error + for attempt := 0; attempt < iptablesDeleteAttempts; attempt++ { + if attempt > 0 { + time.Sleep(time.Duration(attempt) * 100 * time.Millisecond) + } + err = newTestIPTablesCommand(delArgs...).Run() + if err == nil { + return + } } + logTestNetworkErr(fmt.Sprintf("iptables %s", strings.Join(delArgs, " ")), err) } func legacyParallelTestNetworkConfig(seq uint32) config.NetworkConfig {