From b79ba613522528d8dd280d3062007bcf6869aafa Mon Sep 17 00:00:00 2001 From: Hiro Tamada Date: Tue, 9 Jun 2026 16:07:09 -0400 Subject: [PATCH 1/3] Stabilize Linux KVM CI on shared runners Limit host-level contention in CI and clean up VM helpers that survive timed-out tests, so Firecracker/QEMU integration runs do not leave pressure on deft-kernel-dev. Co-authored-by: Cursor --- .github/workflows/test.yml | 18 ++++++++++++- Makefile | 11 ++++++-- lib/instances/firecracker_test.go | 2 +- lib/instances/manager_test.go | 44 ++++++++++++++++++++++++++++++- lib/instances/qemu_test.go | 1 + 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e333de51..a510c5f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,6 +6,9 @@ on: jobs: test: runs-on: [self-hosted, linux, x64, kvm] + concurrency: + group: linux-kvm-ci-test + cancel-in-progress: false steps: - uses: actions/checkout@v4 with: @@ -77,9 +80,13 @@ jobs: - name: Create run-scoped temp directory run: | TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" + TEST_TMPDIR="/tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }}" sudo rm -rf "$TEST_NETWORK_TMPDIR" + sudo rm -rf "$TEST_TMPDIR" mkdir -p "$TEST_NETWORK_TMPDIR" + mkdir -p "$TEST_TMPDIR" sudo chown -R "$(id -u):$(id -g)" "$TEST_NETWORK_TMPDIR" + sudo chown -R "$(id -u):$(id -g)" "$TEST_TMPDIR" # Avoids rate limits when running the tests # Tests includes pulling, then converting to disk images @@ -156,11 +163,12 @@ jobs: HYPEMAN_TEST_REGISTRY: 127.0.0.1:5001 HYPEMAN_UFFD_PAGER_BINARY: ${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }} HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX: ci-${{ github.run_id }}-${{ github.run_attempt }} + TMPDIR: /tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }} run: | cp "$PWD/bin/hypeman-uffd-pager" "$HYPEMAN_UFFD_PAGER_BINARY" chmod +x "$HYPEMAN_UFFD_PAGER_BINARY" export HYPEMAN_TEST_PREWARM_DIR="$HOME/.cache/hypeman-ci/linux-amd64" - make test TEST_TIMEOUT=20m + make test TEST_TIMEOUT=20m GO_TEST_PARALLELISM=4 - name: Cleanup if: always() @@ -170,8 +178,16 @@ jobs: echo "$units" | xargs -r sudo systemctl stop || true echo "$units" | xargs -r sudo systemctl reset-failed || true fi + TEST_TMPDIR="/tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }}" + stale_pids="$(ps -eo pid=,ppid=,cmd= | awk -v tmpdir="$TEST_TMPDIR" '$2 == 1 && index($0, tmpdir) && /(firecracker|cloud-hypervisor|hypeman-uffd-pager)/ {print $1}' || true)" + if [ -n "$stale_pids" ]; then + echo "$stale_pids" | xargs -r sudo kill || true + sleep 3 + echo "$stale_pids" | xargs -r sudo kill -9 || true + fi sudo rm -f /run/hypeman/uffd/ci-${{ github.run_id }}-${{ github.run_attempt }}-*.env sudo rm -rf "/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" + sudo rm -rf "$TEST_TMPDIR" rm -f "${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }}" test-darwin: diff --git a/Makefile b/Makefile index 21e9ebcb..a98a0598 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ SHELL := /bin/bash # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin GO_TEST_TIMEOUT ?= 300s +GO_TEST_PARALLELISM ?= UFFD_PAGER_BINARY ?= $(BIN_DIR)/hypeman-uffd-pager $(BIN_DIR): @@ -295,25 +296,31 @@ endif # Linux tests (as root for network capabilities) test-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries build-embedded $(BIN_DIR)/hypeman-uffd-pager @VERBOSE_FLAG=""; \ + PARALLEL_FLAG=""; \ TEST_PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$$PATH"; \ if [ -n "$(VERBOSE)" ]; then VERBOSE_FLAG="-v"; fi; \ + if [ -n "$(GO_TEST_PARALLELISM)" ]; then PARALLEL_FLAG="-parallel=$(GO_TEST_PARALLELISM)"; fi; \ if [ -n "$(TEST)" ]; then \ echo "Running specific test: $(TEST)"; \ sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "CI=$${CI:-}" \ + "TMPDIR=$${TMPDIR:-/tmp}" \ + "HYPEMAN_TEST_NETWORK_TMPDIR=$${HYPEMAN_TEST_NETWORK_TMPDIR:-}" \ "HYPEMAN_UFFD_PAGER_BINARY=$${HYPEMAN_UFFD_PAGER_BINARY:-$(UFFD_PAGER_BINARY)}" \ "HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX=$${HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX:-}" \ "HYPEMAN_TEST_PREWARM_DIR=$${HYPEMAN_TEST_PREWARM_DIR:-}" \ "HYPEMAN_TEST_PREWARM_STRICT=$${HYPEMAN_TEST_PREWARM_STRICT:-}" \ "HYPEMAN_TEST_REGISTRY=$${HYPEMAN_TEST_REGISTRY:-}" \ - go test -tags containers_image_openpgp -run=$(TEST) $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) ./...; \ + go test -tags containers_image_openpgp -run=$(TEST) $$VERBOSE_FLAG $$PARALLEL_FLAG -timeout=$(TEST_TIMEOUT) ./...; \ else \ sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "CI=$${CI:-}" \ + "TMPDIR=$${TMPDIR:-/tmp}" \ + "HYPEMAN_TEST_NETWORK_TMPDIR=$${HYPEMAN_TEST_NETWORK_TMPDIR:-}" \ "HYPEMAN_UFFD_PAGER_BINARY=$${HYPEMAN_UFFD_PAGER_BINARY:-$(UFFD_PAGER_BINARY)}" \ "HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX=$${HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX:-}" \ "HYPEMAN_TEST_PREWARM_DIR=$${HYPEMAN_TEST_PREWARM_DIR:-}" \ "HYPEMAN_TEST_PREWARM_STRICT=$${HYPEMAN_TEST_PREWARM_STRICT:-}" \ "HYPEMAN_TEST_REGISTRY=$${HYPEMAN_TEST_REGISTRY:-}" \ - go test -tags containers_image_openpgp $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) ./...; \ + go test -tags containers_image_openpgp $$VERBOSE_FLAG $$PARALLEL_FLAG -timeout=$(TEST_TIMEOUT) ./...; \ fi # macOS tests (no sudo needed, adds e2fsprogs to PATH) diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 111c11b8..fb4efe7a 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -883,7 +883,7 @@ func requireRunningSleepInstance(t *testing.T, ctx context.Context, mgr Manager, return false } return true - }, integrationTestTimeout(30*time.Second), 250*time.Millisecond) + }, integrationTestTimeout(90*time.Second), 250*time.Millisecond) inst, err = mgr.GetInstance(ctx, instanceID) require.NoError(t, err) diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 88f3a7ec..a1d26afd 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -10,6 +10,7 @@ import ( "net/http" "os" "path/filepath" + "strconv" "strings" "syscall" "testing" @@ -186,7 +187,8 @@ func cleanupOrphanedProcesses(t *testing.T, mgr *manager) { // Find all metadata files metaFiles, err := mgr.listMetadataFiles() if err != nil { - return // No metadata files, nothing to clean + cleanupTestHypervisorProcessesByDataDir(t, mgr.paths.DataDir()) + return } for _, metaFile := range metaFiles { @@ -213,6 +215,46 @@ func cleanupOrphanedProcesses(t *testing.T, mgr *manager) { } } } + + cleanupTestHypervisorProcessesByDataDir(t, mgr.paths.DataDir()) +} + +func cleanupTestHypervisorProcessesByDataDir(t *testing.T, dataDir string) { + if dataDir == "" { + return + } + + entries, err := os.ReadDir("/proc") + if err != nil { + return + } + + for _, entry := range entries { + pid, err := strconv.Atoi(entry.Name()) + if err != nil || pid <= 0 { + continue + } + + cmdlineBytes, err := os.ReadFile(filepath.Join("/proc", entry.Name(), "cmdline")) + if err != nil || !bytes.Contains(cmdlineBytes, []byte(dataDir)) { + continue + } + + cmdline := strings.ReplaceAll(string(cmdlineBytes), "\x00", " ") + if !strings.Contains(cmdline, "cloud-hypervisor") && + !strings.Contains(cmdline, "firecracker") && + !strings.Contains(cmdline, "hypeman-uffd-pager") { + continue + } + + if err := syscall.Kill(pid, 0); err != nil { + continue + } + + t.Logf("Cleaning up test hypervisor helper process: PID %d (%s)", pid, cmdline) + _ = syscall.Kill(pid, syscall.SIGKILL) + WaitForProcessExit(pid, 1*time.Second) + } } func TestBasicEndToEnd(t *testing.T) { diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 97e7d0eb..0a9507e2 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -69,6 +69,7 @@ func setupTestManagerForQEMU(t *testing.T) (*manager, string) { // Register cleanup to kill any orphaned QEMU processes t.Cleanup(func() { cleanupOrphanedQEMUProcesses(t, mgr) + cleanupTestHypervisorProcessesByDataDir(t, mgr.paths.DataDir()) }) return mgr, tmpDir From eebefcbaf48636dbbe4556df0dad690be9e34d01 Mon Sep 17 00:00:00 2001 From: Hiro Tamada Date: Tue, 9 Jun 2026 16:22:04 -0400 Subject: [PATCH 2/3] Shorten Linux CI temp paths Keep the run-scoped cleanup root short enough for Firecracker and Cloud Hypervisor Unix socket path limits. Co-authored-by: Cursor --- .github/workflows/test.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a510c5f5..2ab5c415 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -80,7 +80,13 @@ jobs: - name: Create run-scoped temp directory run: | TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" - TEST_TMPDIR="/tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }}" + TEST_TMPDIR="/hci${{ github.run_attempt }}" + stale_pids="$(ps -eo pid=,ppid=,cmd= | awk -v tmpdir="$TEST_TMPDIR" '$2 == 1 && index($0, tmpdir) && /(firecracker|cloud-hypervisor|hypeman-uffd-pager)/ {print $1}' || true)" + if [ -n "$stale_pids" ]; then + echo "$stale_pids" | xargs -r sudo kill || true + sleep 3 + echo "$stale_pids" | xargs -r sudo kill -9 || true + fi sudo rm -rf "$TEST_NETWORK_TMPDIR" sudo rm -rf "$TEST_TMPDIR" mkdir -p "$TEST_NETWORK_TMPDIR" @@ -163,7 +169,7 @@ jobs: HYPEMAN_TEST_REGISTRY: 127.0.0.1:5001 HYPEMAN_UFFD_PAGER_BINARY: ${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }} HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX: ci-${{ github.run_id }}-${{ github.run_attempt }} - TMPDIR: /tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }} + TMPDIR: /hci${{ github.run_attempt }} run: | cp "$PWD/bin/hypeman-uffd-pager" "$HYPEMAN_UFFD_PAGER_BINARY" chmod +x "$HYPEMAN_UFFD_PAGER_BINARY" @@ -178,7 +184,7 @@ jobs: echo "$units" | xargs -r sudo systemctl stop || true echo "$units" | xargs -r sudo systemctl reset-failed || true fi - TEST_TMPDIR="/tmp/hypeman-ci-${{ github.run_id }}-${{ github.run_attempt }}" + TEST_TMPDIR="/hci${{ github.run_attempt }}" stale_pids="$(ps -eo pid=,ppid=,cmd= | awk -v tmpdir="$TEST_TMPDIR" '$2 == 1 && index($0, tmpdir) && /(firecracker|cloud-hypervisor|hypeman-uffd-pager)/ {print $1}' || true)" if [ -n "$stale_pids" ]; then echo "$stale_pids" | xargs -r sudo kill || true From 483a0649846dffac149242565661bf1aa3b70766 Mon Sep 17 00:00:00 2001 From: Hiro Tamada Date: Tue, 9 Jun 2026 18:18:45 -0400 Subject: [PATCH 3/3] Use writable short CI temp path Keep the test temp root short for VMM socket limits while placing it under /tmp so the runner can create it. Co-authored-by: Cursor --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2ab5c415..d52a161c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -80,7 +80,7 @@ jobs: - name: Create run-scoped temp directory run: | TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}" - TEST_TMPDIR="/hci${{ github.run_attempt }}" + TEST_TMPDIR="/tmp/hci${{ github.run_attempt }}" stale_pids="$(ps -eo pid=,ppid=,cmd= | awk -v tmpdir="$TEST_TMPDIR" '$2 == 1 && index($0, tmpdir) && /(firecracker|cloud-hypervisor|hypeman-uffd-pager)/ {print $1}' || true)" if [ -n "$stale_pids" ]; then echo "$stale_pids" | xargs -r sudo kill || true @@ -169,7 +169,7 @@ jobs: HYPEMAN_TEST_REGISTRY: 127.0.0.1:5001 HYPEMAN_UFFD_PAGER_BINARY: ${{ runner.temp }}/hypeman-uffd-pager-${{ github.run_id }}-${{ github.run_attempt }} HYPEMAN_UFFD_SYSTEMD_INSTANCE_PREFIX: ci-${{ github.run_id }}-${{ github.run_attempt }} - TMPDIR: /hci${{ github.run_attempt }} + TMPDIR: /tmp/hci${{ github.run_attempt }} run: | cp "$PWD/bin/hypeman-uffd-pager" "$HYPEMAN_UFFD_PAGER_BINARY" chmod +x "$HYPEMAN_UFFD_PAGER_BINARY" @@ -184,7 +184,7 @@ jobs: echo "$units" | xargs -r sudo systemctl stop || true echo "$units" | xargs -r sudo systemctl reset-failed || true fi - TEST_TMPDIR="/hci${{ github.run_attempt }}" + TEST_TMPDIR="/tmp/hci${{ github.run_attempt }}" stale_pids="$(ps -eo pid=,ppid=,cmd= | awk -v tmpdir="$TEST_TMPDIR" '$2 == 1 && index($0, tmpdir) && /(firecracker|cloud-hypervisor|hypeman-uffd-pager)/ {print $1}' || true)" if [ -n "$stale_pids" ]; then echo "$stale_pids" | xargs -r sudo kill || true