From 51b5d7b3ba8fa11382d52ebca67a0c31c5899051 Mon Sep 17 00:00:00 2001
From: pptx704
Date: Mon, 4 May 2026 01:46:57 +0600
Subject: [PATCH] fix: resolve pause/snapshot failures and CoW exhaustion on
 large VMs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the hard 10s timeout from the Firecracker HTTP client — callers
already pass a context.Context with an appropriate deadline, and 20GB+
memfile writes easily exceed 10s.

Ensure the CoW file is at least as large as the origin rootfs.
Previously, WRENN_DEFAULT_ROOTFS_SIZE=30Gi expanded the base image to
30GB, but the default 5GB CoW could not hold all writes, causing
dm-snapshot invalidation and EIO on all guest I/O.

Destroy frozen VMs in resumeOnError instead of leaving zombies that
report "running" but can't execute. Use a fresh context for the resume
attempt so a cancelled caller context doesn't falsely trigger destroy.

Increase the CP→Agent ResponseHeaderTimeout from 45s to 5min and the
PrepareSnapshot timeout from 3s to 30s for large-memory VMs.

After a failed pause, ping the agent to detect destroyed sandboxes and
mark the DB status as "error" instead of reverting to "running".
---
 internal/sandbox/manager.go | 36 +++++++++++++++++++++++++++---------
 internal/vm/fc.go           |  5 +++--
 pkg/lifecycle/hostpool.go   |  2 +-
 pkg/service/sandbox.go      | 20 +++++++++++++++++---
 4 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go
index f87e0f7..a98f708 100644
--- a/internal/sandbox/manager.go
+++ b/internal/sandbox/manager.go
@@ -186,9 +186,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
 	}
 
 	// Create dm-snapshot with per-sandbox CoW file.
+	// CoW must be at least as large as the origin — if every block is
+	// rewritten, the CoW stores a full copy. An undersized CoW causes
+	// dm-snapshot invalidation → EIO on all guest I/O.
 	dmName := "wrenn-" + sandboxID
 	cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
-	cowSize := int64(diskSizeMB) * 1024 * 1024
+	cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
 	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
 	if err != nil {
 		m.loops.Release(baseRootfs)
@@ -391,11 +394,13 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 	slog.Debug("pause: envd client idle connections closed", "id", sandboxID)
 
 	// Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
-	// connections for post-restore cleanup). The 3s timeout also gives time
-	// for the FINs from Step 0b to be processed by the guest kernel.
+	// connections for post-restore cleanup). This also drops the page
+	// cache, which can take significant time on large-memory VMs (20GB+).
+	// The timeout also gives time for the FINs from Step 0b to be
+	// processed by the guest kernel.
 	// Best-effort: a failure is logged but does not abort the pause.
 	func() {
-		prepCtx, prepCancel := context.WithTimeout(ctx, 3*time.Second)
+		prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
 		defer prepCancel()
 		if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
 			slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
@@ -423,12 +428,24 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 
 	// resumeOnError unpauses the VM so the sandbox stays usable when a
 	// post-freeze step fails. If the resume itself fails, the sandbox is
-	// left frozen — the caller should destroy it. It also resets the
-	// connection tracker so the sandbox can accept proxy connections again.
+	// frozen and unrecoverable — destroy it to avoid a zombie that reports
+	// "running" but can't execute anything.
 	resumeOnError := func() {
 		sb.connTracker.Reset()
-		if err := m.vm.Resume(ctx, sandboxID); err != nil {
-			slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
+		// Use a fresh context — the caller's ctx may already be cancelled
+		// (e.g. CP-side ResponseHeaderTimeout fired), which would make the
+		// resume fail immediately and destroy a perfectly resumable VM.
+		resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer resumeCancel()
+		if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
+			slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
+			m.cleanup(context.Background(), sb)
+			m.mu.Lock()
+			delete(m.boxes, sandboxID)
+			m.mu.Unlock()
+			if m.onDestroy != nil {
+				m.onDestroy(sandboxID)
+			}
 		}
 	}
 
@@ -444,6 +461,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 
 	snapshotStart := time.Now()
 	if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
+		slog.Error("pause: snapshot failed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart), "error", err)
 		warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
 		resumeOnError()
 		return fmt.Errorf("create VM snapshot: %w", err)
@@ -1134,7 +1152,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
 
 	dmName := "wrenn-" + sandboxID
 	cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
-	cowSize := int64(diskSizeMB) * 1024 * 1024
+	cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
 	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
 	if err != nil {
 		source.Close()
diff --git a/internal/vm/fc.go b/internal/vm/fc.go
index e8f1ac3..333fd00 100644
--- a/internal/vm/fc.go
+++ b/internal/vm/fc.go
@@ -8,7 +8,6 @@ import (
 	"io"
 	"net"
 	"net/http"
-	"time"
 )
 
 // fcClient talks to the Firecracker HTTP API over a Unix socket.
@@ -27,7 +26,9 @@ func newFCClient(socketPath string) *fcClient {
 					return d.DialContext(ctx, "unix", socketPath)
 				},
 			},
-			Timeout: 10 * time.Second,
+			// No global timeout — callers pass a context.Context with an
+			// appropriate deadline. A fixed 10s timeout was too short for
+			// snapshot/resume operations on large-memory VMs (20GB+ memfiles).
 		},
 	}
 }
diff --git a/pkg/lifecycle/hostpool.go b/pkg/lifecycle/hostpool.go
index 508bb52..a54fa5e 100644
--- a/pkg/lifecycle/hostpool.go
+++ b/pkg/lifecycle/hostpool.go
@@ -47,7 +47,7 @@ func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool {
 		TLSNextProto:          make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
 		MaxIdleConnsPerHost:   20,
 		IdleConnTimeout:       90 * time.Second,
-		ResponseHeaderTimeout: 45 * time.Second,
+		ResponseHeaderTimeout: 5 * time.Minute,
 		DialContext: (&net.Dialer{
 			Timeout:   10 * time.Second,
 			KeepAlive: 30 * time.Second,
diff --git a/pkg/service/sandbox.go b/pkg/service/sandbox.go
index d50520b..aa736da 100644
--- a/pkg/service/sandbox.go
+++ b/pkg/service/sandbox.go
@@ -239,12 +239,26 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
 
 	if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
 		SandboxId: sandboxIDStr,
 	})); err != nil {
-		// Revert status on failure.
-		if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
-			ID: sandboxID, Status: "running",
+		// Check if the agent still has this sandbox. If it was destroyed
+		// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
+		// reverting to "running" — which would create a ghost record.
+		// Use a fresh context since the original ctx may already be expired.
+		revertStatus := "running"
+		pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
+		if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
+			SandboxId: sandboxIDStr,
+		})); pingErr != nil {
+			revertStatus = "error"
+			slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
+		}
+		pingCancel()
+		dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
+		if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
+			ID: sandboxID, Status: revertStatus,
 		}); dbErr != nil {
 			slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
 		}
+		dbCancel()
 		return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
 	}
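
Notes: the sketches below illustrate the reasoning behind the fix. They
are standalone, hypothetical Go programs, not code from this repository.

1. CoW sizing. dm-snapshot invalidates the snapshot once the CoW device
fills, and the worst case is a guest that rewrites every origin block.
A minimal sketch of the sizing rule the patch adopts (cowSizeBytes is an
illustrative name; the max builtin needs Go 1.21+):

package main

import "fmt"

// cowSizeBytes mirrors the patched expression
// max(int64(diskSizeMB)*1024*1024, originSize). If the guest rewrites
// every block, the CoW holds a full copy of the origin, so anything
// smaller risks dm-snapshot invalidation and EIO on all guest I/O.
// (Persistent snapshots also store exception metadata in the CoW, so
// the true worst case is slightly above originSize.)
func cowSizeBytes(diskSizeMB, originSize int64) int64 {
	return max(diskSizeMB*1024*1024, originSize)
}

func main() {
	const gib = int64(1) << 30
	originSize := 30 * gib    // the WRENN_DEFAULT_ROOTFS_SIZE=30Gi scenario
	diskSizeMB := int64(5120) // old default: 5GB CoW

	// Before the fix: 5368709120 bytes, far below the 30GiB origin.
	fmt.Println(diskSizeMB * 1024 * 1024)
	// After the fix: 32212254720 bytes, never below the origin.
	fmt.Println(cowSizeBytes(diskSizeMB, originSize))
}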
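
2. Fresh context for recovery paths. resumeOnError now derives its
context from context.Background() rather than from the caller's ctx.
A minimal sketch of why, with recoverVM standing in for m.vm.Resume
(a hypothetical name):

package main

import (
	"context"
	"fmt"
	"time"
)

// recoverVM stands in for m.vm.Resume: like any context-aware call, it
// fails immediately once its context is cancelled.
func recoverVM(ctx context.Context) error {
	return ctx.Err()
}

func main() {
	// Simulate the failure mode: the control plane's
	// ResponseHeaderTimeout fires and cancels the caller's ctx mid-pause.
	callerCtx, cancel := context.WithCancel(context.Background())
	cancel()

	// Inheriting the cancelled ctx makes the resume fail instantly,
	// which would destroy a perfectly resumable VM.
	fmt.Println(recoverVM(callerCtx)) // context canceled

	// The patch instead uses a fresh, bounded context, so the recovery
	// attempt gets a real chance but still cannot hang forever.
	resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer resumeCancel()
	fmt.Println(recoverVM(resumeCtx)) // <nil>
}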
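
3. Probing before writing status. After a failed pause, the control
plane no longer blindly reverts the DB row to "running"; it pings the
agent first. A sketch of the decision, where ping and
statusAfterFailedPause are illustrative stand-ins for agent.PingSandbox
and the patched handler logic:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errNotFound = errors.New("sandbox not found")

// ping stands in for agent.PingSandbox; alive simulates whether the
// agent still tracks the sandbox.
func ping(ctx context.Context, alive bool) error {
	if err := ctx.Err(); err != nil {
		return err
	}
	if !alive {
		return errNotFound
	}
	return nil
}

// statusAfterFailedPause mirrors the patched decision: revert to
// "running" only if the agent can still see the sandbox; otherwise the
// frozen VM was destroyed and the honest DB status is "error".
func statusAfterFailedPause(alive bool) string {
	// Fresh context: the original request ctx may already be expired.
	pingCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := ping(pingCtx, alive); err != nil {
		return "error"
	}
	return "running"
}

func main() {
	fmt.Println(statusAfterFailedPause(true))  // running: pause failed but the VM survived
	fmt.Println(statusAfterFailedPause(false)) // error: the VM was destroyed in resumeOnError
}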