diff --git a/CLAUDE.md b/CLAUDE.md index cdde31cf..5c3b64d7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -78,6 +78,10 @@ Each `[[containers]]` block may set an optional `image` to override the default - The **host publish IP** for *all* published ports (gateway ports + the 4510-4559 service range) is the host part of the first entry, defaulting to `127.0.0.1`. So `GATEWAY_LISTEN = "0.0.0.0:4566,0.0.0.0:443"` exposes the emulator beyond loopback (e.g. on an EC2/MicroVM host). This is threaded through as `runtime.ContainerConfig.BindHost` and applied in `internal/runtime/docker.go`. - Gateway ports beyond the primary edge port (4566, which is published on the configured `port`) are published host-port == container-port, so listing an extra port like `:8443` publishes it. `servicePortRange()` covers only 4510-4559 now — 443 comes from the default `GATEWAY_LISTEN`. +## Startup timeout + +`lstk start --timeout` (also accepted by bare `lstk`, which registers the same start flags) bounds how long lstk waits for the emulator to report healthy (`awaitStartup` in `internal/container/start.go`), so a container that never comes up fails fast with a clear error and non-zero exit instead of hanging (e.g. in CI). It defaults to `defaultStartupTimeout` (5m, defined in `cmd/start.go`); `--timeout 0` (or any non-positive duration) disables it (wait indefinitely). The value is threaded as `StartOptions.StartupTimeout` and applied by wrapping the wait context with a deadline; a deadline hit is surfaced as `startupTimeoutError` (telemetry `ErrCodeStartTimeout`). `restart` and the snapshot auto-start path reuse the same default but do not expose the flag. + ## Volume Mounts Each `[[containers]]` block accepts a `volumes` list of Docker-style `"host:container[:ro]"` bind specs (e.g. for Snowflake init hooks mounted into `/etc/localstack/init/{boot,start,ready,shutdown}.d`).
The persistence/cache mount to `/var/lib/localstack` is folded into this list: the entry whose container target is `/var/lib/localstack` (`persistenceTarget` in `internal/config/containers.go`) defines the host dir backing it, and that path is what `VolumeDir()`, `lstk volume path`, and `lstk volume clear` resolve. Resolution precedence in `VolumeDir()`: a `volumes` entry targeting `/var/lib/localstack` → the legacy singular `volume = "..."` field (still honored for backward compatibility) → the default OS cache dir. Setting the persistence dir via both `volume` and a `volumes` entry with differing sources is a validation error. diff --git a/cmd/restart.go b/cmd/restart.go index 754d9425..60a43fe2 100644 --- a/cmd/restart.go +++ b/cmd/restart.go @@ -40,7 +40,7 @@ func newRestartCmd(cfg *env.Env, tel *telemetry.Client, logger log.Logger) *cobr stopOpts := container.StopOptions{ Telemetry: tel, } - startOpts := buildStartOptions(cfg, appConfig, logger, tel, persist) + startOpts := buildStartOptions(cfg, appConfig, logger, tel, persist, defaultStartupTimeout) if isInteractiveMode(cfg) { return ui.RunRestart(cmd.Context(), rt, stopOpts, startOpts) diff --git a/cmd/root.go b/cmd/root.go index 0c156803..82d81d6e 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -64,15 +64,11 @@ func NewRootCmd(cfg *env.Env, tel *telemetry.Client, logger log.Logger) *cobra.C if err != nil { return err } - persist, err := cmd.Flags().GetBool("persist") + persist, timeout, snapshotFlag, noSnapshot, err := startFlags(cmd) if err != nil { return err } - snapshotFlag, noSnapshot, err := snapshotFlags(cmd) - if err != nil { - return err - } - return startEmulator(cmd.Context(), rt, cfg, tel, logger, persist, firstRun, snapshotFlag, noSnapshot) + return startEmulator(cmd.Context(), rt, cfg, tel, logger, persist, timeout, firstRun, snapshotFlag, noSnapshot) }, } @@ -82,8 +78,7 @@ func NewRootCmd(cfg *env.Env, tel *telemetry.Client, logger log.Logger) *cobra.C root.PersistentFlags().String("config", 
"", "Path to config file") root.PersistentFlags().BoolVar(&cfg.NonInteractive, "non-interactive", false, "Disable interactive mode") - root.Flags().Bool("persist", false, "Persist emulator state across restarts") - addSnapshotStartFlags(root) + addStartFlags(root) // Parse lstk's global flags only when they precede the command name: with // interspersing disabled, Cobra consumes leading flags and hands everything @@ -234,7 +229,7 @@ func Execute(ctx context.Context) error { return nil } -func buildStartOptions(cfg *env.Env, appConfig *config.Config, logger log.Logger, tel *telemetry.Client, persist bool) container.StartOptions { +func buildStartOptions(cfg *env.Env, appConfig *config.Config, logger log.Logger, tel *telemetry.Client, persist bool, startupTimeout time.Duration) container.StartOptions { return container.StartOptions{ PlatformClient: api.NewPlatformClient(cfg.APIEndpoint, logger), AuthToken: cfg.AuthToken, @@ -244,12 +239,13 @@ func buildStartOptions(cfg *env.Env, appConfig *config.Config, logger log.Logger Containers: appConfig.Containers, Env: appConfig.Env, Persist: persist, + StartupTimeout: startupTimeout, Logger: logger, Telemetry: tel, } } -func startEmulator(ctx context.Context, rt runtime.Runtime, cfg *env.Env, tel *telemetry.Client, logger log.Logger, persist bool, firstRun bool, snapshotFlag string, noSnapshot bool) error { +func startEmulator(ctx context.Context, rt runtime.Runtime, cfg *env.Env, tel *telemetry.Client, logger log.Logger, persist bool, startupTimeout time.Duration, firstRun bool, snapshotFlag string, noSnapshot bool) error { appConfig, err := config.Get() if err != nil { return fmt.Errorf("failed to get config: %w", err) @@ -265,7 +261,7 @@ func startEmulator(ctx context.Context, rt runtime.Runtime, cfg *env.Env, tel *t return err } - opts := buildStartOptions(cfg, appConfig, logger, tel, persist) + opts := buildStartOptions(cfg, appConfig, logger, tel, persist, startupTimeout) notifyOpts := update.NotifyOptions{ 
GitHubToken: cfg.GitHubToken, diff --git a/cmd/snapshot.go b/cmd/snapshot.go index 7e801205..04479a6d 100644 --- a/cmd/snapshot.go +++ b/cmd/snapshot.go @@ -182,7 +182,7 @@ func newSnapshotAutoLoader(cfg *env.Env, rt runtime.Runtime, appConfig *config.C func buildStarter(cfg *env.Env, rt runtime.Runtime, appConfig *config.Config, logger log.Logger, tel *telemetry.Client) snapshot.Starter { return func(ctx context.Context, sink output.Sink) error { - opts := buildStartOptions(cfg, appConfig, logger, tel, false) + opts := buildStartOptions(cfg, appConfig, logger, tel, false, defaultStartupTimeout) _, err := container.Start(ctx, rt, sink, opts, false) return err } diff --git a/cmd/start.go b/cmd/start.go index aa60bb4a..69a1aac2 100644 --- a/cmd/start.go +++ b/cmd/start.go @@ -1,6 +1,8 @@ package cmd import ( + "time" + "github.com/localstack/lstk/internal/env" "github.com/localstack/lstk/internal/log" "github.com/localstack/lstk/internal/runtime" @@ -8,6 +10,30 @@ import ( "github.com/spf13/cobra" ) +// defaultStartupTimeout bounds how long `lstk start` waits for the emulator to +// become healthy before failing. `--timeout 0` disables it. +const defaultStartupTimeout = 5 * time.Minute + +// addStartFlags registers the flags shared by the `start` command and the root +// command (which starts the emulator when invoked without a subcommand). +func addStartFlags(cmd *cobra.Command) { + cmd.Flags().Bool("persist", false, "Persist emulator state across restarts") + cmd.Flags().Duration("timeout", defaultStartupTimeout, "Maximum time to wait for the emulator to become ready (0 disables the timeout)") + addSnapshotStartFlags(cmd) +} + +// startFlags parses the flags shared by the `start` and root commands. 
+func startFlags(cmd *cobra.Command) (persist bool, timeout time.Duration, snapshotFlag string, noSnapshot bool, err error) { + if persist, err = cmd.Flags().GetBool("persist"); err != nil { + return + } + if timeout, err = cmd.Flags().GetDuration("timeout"); err != nil { + return + } + snapshotFlag, noSnapshot, err = snapshotFlags(cmd) + return +} + func newStartCmd(cfg *env.Env, tel *telemetry.Client, logger log.Logger) *cobra.Command { var firstRun bool cmd := &cobra.Command{ @@ -24,18 +50,13 @@ If a snapshot is configured for the AWS emulator (the snapshot field in [[contai if err != nil { return err } - persist, err := c.Flags().GetBool("persist") + persist, timeout, snapshotFlag, noSnapshot, err := startFlags(c) if err != nil { return err } - snapshotFlag, noSnapshot, err := snapshotFlags(c) - if err != nil { - return err - } - return startEmulator(c.Context(), rt, cfg, tel, logger, persist, firstRun, snapshotFlag, noSnapshot) + return startEmulator(c.Context(), rt, cfg, tel, logger, persist, timeout, firstRun, snapshotFlag, noSnapshot) }, } - cmd.Flags().Bool("persist", false, "Persist emulator state across restarts") - addSnapshotStartFlags(cmd) + addStartFlags(cmd) return cmd } diff --git a/internal/container/start.go b/internal/container/start.go index 1242f9b4..488467fc 100644 --- a/internal/container/start.go +++ b/internal/container/start.go @@ -44,8 +44,11 @@ type StartOptions struct { Containers []config.ContainerConfig Env map[string]map[string]string Persist bool - Logger log.Logger - Telemetry *telemetry.Client + // StartupTimeout bounds how long to wait for the emulator to become + // healthy. A value <= 0 disables the timeout (wait indefinitely). 
+ StartupTimeout time.Duration + Logger log.Logger + Telemetry *telemetry.Client } func Start(ctx context.Context, rt runtime.Runtime, sink output.Sink, opts StartOptions, interactive bool) (string, error) { @@ -239,7 +242,7 @@ func Start(ctx context.Context, rt runtime.Runtime, sink output.Sink, opts Start } } - if err := startContainers(ctx, rt, sink, tel, containers, pulled); err != nil { + if err := startContainers(ctx, rt, sink, tel, containers, pulled, opts.StartupTimeout); err != nil { return "", err } @@ -504,7 +507,7 @@ func validateLicensesFromImages(ctx context.Context, rt runtime.Runtime, sink ou return firstVersion, nil } -func startContainers(ctx context.Context, rt runtime.Runtime, sink output.Sink, tel *telemetry.Client, containers []runtime.ContainerConfig, pulled map[string]bool) error { +func startContainers(ctx context.Context, rt runtime.Runtime, sink output.Sink, tel *telemetry.Client, containers []runtime.ContainerConfig, pulled map[string]bool, startupTimeout time.Duration) error { for _, c := range containers { startTime := time.Now() sink.Emit(output.SpinnerStart("Starting LocalStack")) @@ -522,11 +525,13 @@ func startContainers(ctx context.Context, rt runtime.Runtime, sink output.Sink, } healthURL := fmt.Sprintf("http://localhost:%s%s", c.Port, c.HealthPath) - if err := awaitStartup(ctx, rt, sink, containerID, "LocalStack", healthURL); err != nil { + if err := awaitStartup(ctx, rt, sink, containerID, "LocalStack", healthURL, startupTimeout); err != nil { sink.Emit(output.SpinnerStop()) errCode := telemetry.ErrCodeStartFailed var licErr *licenseNotCoveredError - if errors.As(err, &licErr) && c.EmulatorType.SelfValidatesLicense() { + var timeoutErr *startupTimeoutError + switch { + case errors.As(err, &licErr) && c.EmulatorType.SelfValidatesLicense(): errCode = telemetry.ErrCodeLicenseInvalid sink.Emit(output.ErrorEvent{ Title: fmt.Sprintf("Your license does not include the %s emulator.", c.EmulatorType.ShortName()), @@ -536,6 +541,17 @@ 
func startContainers(ctx context.Context, rt runtime.Runtime, sink output.Sink, }, }) err = output.NewSilentError(err) + case errors.As(err, &timeoutErr): + errCode = telemetry.ErrCodeStartTimeout + sink.Emit(output.ErrorEvent{ + Title: fmt.Sprintf("LocalStack did not become ready within %s.", timeoutErr.timeout), + Summary: "The emulator is still running but did not report healthy in time.", + Actions: []output.ErrorAction{ + {Label: "Inspect the emulator logs:", Value: "lstk logs"}, + {Label: "Raise or disable the timeout:", Value: "lstk start --timeout 0"}, + }, + }) + err = output.NewSilentError(err) } tel.EmitEmulatorLifecycleEvent(ctx, telemetry.LifecycleEvent{ EventType: telemetry.LifecycleStartError, @@ -775,12 +791,29 @@ func (e *licenseNotCoveredError) Error() string { return "license does not include this emulator" } +// startupTimeoutError is returned by awaitStartup when the emulator does not +// become healthy within the configured startup timeout. +type startupTimeoutError struct { + name string + timeout time.Duration +} + +func (e *startupTimeoutError) Error() string { + return fmt.Sprintf("%s did not become ready within %s", e.name, e.timeout) +} + // awaitStartup polls until one of two outcomes: // - Success: health endpoint returns 200 (license is valid, LocalStack is ready) // - Failure: container stops running (e.g., license activation failed), returns error with container logs // // TODO: move to Runtime interface if other runtimes (k8s?) 
need native readiness probes -func awaitStartup(ctx context.Context, rt runtime.Runtime, sink output.Sink, containerID, name, healthURL string) error { +func awaitStartup(ctx context.Context, rt runtime.Runtime, sink output.Sink, containerID, name, healthURL string, timeout time.Duration) error { + if timeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, timeout) + defer cancel() + } + client := &http.Client{Timeout: 2 * time.Second} for { @@ -814,6 +847,9 @@ func awaitStartup(ctx context.Context, rt runtime.Runtime, sink output.Sink, con select { case <-ctx.Done(): + if timeout > 0 && errors.Is(ctx.Err(), context.DeadlineExceeded) { + return &startupTimeoutError{name: name, timeout: timeout} + } return ctx.Err() case <-time.After(1 * time.Second): } diff --git a/internal/container/start_test.go b/internal/container/start_test.go index bedddb10..2ade6a43 100644 --- a/internal/container/start_test.go +++ b/internal/container/start_test.go @@ -14,6 +14,7 @@ import ( "sync" "sync/atomic" "testing" + "time" "github.com/localstack/lstk/internal/api" "github.com/localstack/lstk/internal/caller" @@ -561,7 +562,7 @@ func TestStartContainers_SnowflakeLicenseError(t *testing.T) { var out bytes.Buffer sink := output.NewPlainSink(&out) - err := startContainers(context.Background(), mockRT, sink, tel, []runtime.ContainerConfig{c}, map[string]bool{}) + err := startContainers(context.Background(), mockRT, sink, tel, []runtime.ContainerConfig{c}, map[string]bool{}, 0) tel.Close() require.Error(t, err) @@ -607,7 +608,7 @@ func TestStartContainers_AzureLicenseError(t *testing.T) { var out bytes.Buffer sink := output.NewPlainSink(&out) - err := startContainers(context.Background(), mockRT, sink, tel, []runtime.ContainerConfig{c}, map[string]bool{}) + err := startContainers(context.Background(), mockRT, sink, tel, []runtime.ContainerConfig{c}, map[string]bool{}, 0) tel.Close() require.Error(t, err) @@ -629,6 +630,51 @@ func 
TestStartContainers_AzureLicenseError(t *testing.T) { } } +func TestStartContainers_StartupTimeout(t *testing.T) { + ctrl := gomock.NewController(t) + mockRT := runtime.NewMockRuntime(ctrl) + + c := runtime.ContainerConfig{ + Image: "localstack/localstack-pro:latest", + Name: "localstack-aws", + EmulatorType: config.EmulatorAWS, + Tag: "latest", + Port: "59999", // nothing listens here, so the health poll never succeeds + ContainerPort: "4566/tcp", + HealthPath: "/_localstack/health", + } + const containerID = "abc123" + mockRT.EXPECT().Start(gomock.Any(), c).Return(containerID, nil) + // Container stays up but never becomes healthy, so awaitStartup loops until the timeout. + mockRT.EXPECT().IsRunning(gomock.Any(), containerID).Return(true, nil).AnyTimes() + + tel, capturedEvents := newCapturingTelClient(t) + + var out bytes.Buffer + sink := output.NewPlainSink(&out) + + err := startContainers(context.Background(), mockRT, sink, tel, []runtime.ContainerConfig{c}, map[string]bool{}, 50*time.Millisecond) + tel.Close() + + require.Error(t, err) + assert.True(t, output.IsSilent(err), "error should be silent since ErrorEvent was already emitted") + var timeoutErr *startupTimeoutError + assert.ErrorAs(t, err, &timeoutErr) + got := out.String() + assert.Contains(t, got, "LocalStack did not become ready within 50ms.") + assert.Contains(t, got, "lstk start --timeout 0") + + select { + case ev := <-capturedEvents: + payload, ok := ev["payload"].(map[string]any) + require.True(t, ok, "telemetry event should have a payload map") + assert.Equal(t, telemetry.LifecycleStartError, payload["event_type"]) + assert.Equal(t, telemetry.ErrCodeStartTimeout, payload["error_code"]) + default: + t.Fatal("no telemetry event received") + } +} + func TestPullImages_ReusesLocalImageWhenPresent(t *testing.T) { ctrl := gomock.NewController(t) mockRT := runtime.NewMockRuntime(ctrl) diff --git a/internal/telemetry/events.go b/internal/telemetry/events.go index 20b1d6a3..09888bfb 100644 --- 
a/internal/telemetry/events.go +++ b/internal/telemetry/events.go @@ -78,6 +78,7 @@ const ( ErrCodeImagePullFailed = "image_pull_failed" ErrCodeLicenseInvalid = "license_invalid" ErrCodeStartFailed = "start_failed" + ErrCodeStartTimeout = "start_timeout" ErrCodeEmulatorMismatch = "emulator_mismatch" ) diff --git a/test/integration/main_test.go b/test/integration/main_test.go index 46ec3297..5fe0647e 100644 --- a/test/integration/main_test.go +++ b/test/integration/main_test.go @@ -234,6 +234,40 @@ func startExternalContainer(t *testing.T, ctx context.Context, imgName, name, ho }) } +// commitNeverHealthyImage builds a local-only image whose default command stays +// running (sleep infinity) but never serves /_localstack/health. Starting it via +// lstk exercises the failure path where the emulator comes up but never reports +// healthy. Returns the image reference; the image and its source container are +// removed on test cleanup. +func commitNeverHealthyImage(t *testing.T, ctx context.Context) string { + t.Helper() + + reader, err := dockerClient.ImagePull(ctx, testImage, client.ImagePullOptions{}) + require.NoError(t, err, "failed to pull test image") + _, _ = io.Copy(io.Discard, reader) + _ = reader.Close() + + resp, err := dockerClient.ContainerCreate(ctx, client.ContainerCreateOptions{ + Config: &container.Config{Image: testImage}, + Name: "lstk-never-healthy-src", + }) + require.NoError(t, err, "failed to create source container") + t.Cleanup(func() { + _, _ = dockerClient.ContainerRemove(context.Background(), resp.ID, client.ContainerRemoveOptions{Force: true}) + }) + + const imageRef = "lstk-never-healthy:latest" + _, err = dockerClient.ContainerCommit(ctx, resp.ID, client.ContainerCommitOptions{ + Reference: imageRef, + Changes: []string{`CMD ["sleep", "infinity"]`}, + }) + require.NoError(t, err, "failed to commit never-healthy image") + t.Cleanup(func() { + _, _ = dockerClient.ImageRemove(context.Background(), imageRef, client.ImageRemoveOptions{Force: 
true}) + }) + return imageRef +} + func startTestSnowflakeContainer(t *testing.T, ctx context.Context) { t.Helper() startNamedTestContainer(t, ctx, snowflakeContainerName, "snowflake") diff --git a/test/integration/start_test.go b/test/integration/start_test.go index c4056399..fa3bde27 100644 --- a/test/integration/start_test.go +++ b/test/integration/start_test.go @@ -882,6 +882,33 @@ image = "lstk-nonexistent-custom-image" assert.Contains(t, combined, "Failed to pull lstk-nonexistent-custom-image:latest") } +// TestStartTimesOutWhenEmulatorNeverBecomesHealthy verifies that --timeout bounds +// the health-check wait (PRO-357): a container that stays running but never serves +// /_localstack/health must fail fast with a clear error and a non-zero exit, +// instead of hanging indefinitely. +func TestStartTimesOutWhenEmulatorNeverBecomesHealthy(t *testing.T) { + requireDocker(t) + cleanup() + t.Cleanup(cleanup) + + ctx := testContext(t) + imageRef := commitNeverHealthyImage(t, ctx) + + home := t.TempDir() + configFile := filepath.Join(home, "config.toml") + require.NoError(t, os.WriteFile(configFile, + []byte(fmt.Sprintf("[[containers]]\ntype = \"aws\"\nport = \"4566\"\nimage = %q\n", imageRef)), 0644)) + + // The local image is reused (pull fails, ImageExists true), so the license + // pre-flight is skipped and a dummy token is enough to reach the health wait. 
+ e := append(testEnvWithHome(home, ""), string(env.AuthToken)+"=fake-token") + stdout, stderr, err := runLstk(t, ctx, "", e, "--config", configFile, "--non-interactive", "start", "--timeout", "3s") + + require.Error(t, err, "expected start to fail when the emulator never becomes healthy") + requireExitCode(t, 1, err) + assert.Contains(t, stdout+stderr, "LocalStack did not become ready within 3s") +} + // TestStartFallsBackToLocalImageWhenPullFails verifies the offline degradation // path for image pulls: when the configured image cannot be pulled (registry // unreachable, or the image was never published) but is already present locally,