
fix: resolve pause/snapshot failures and CoW exhaustion on large VMs

Remove hard 10s timeout from Firecracker HTTP client — callers already
pass context.Context with appropriate deadlines, and 20GB+ memfile
writes easily exceed 10s.
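
A minimal sketch of the caller-side pattern this change relies on; snapshotVM and
the 10-minute budget below are illustrative stand-ins, not the real call sites:

    // Sketch only: the deadline now lives with the caller, not the client.
    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // snapshotVM stands in for a Firecracker API call whose duration scales
    // with guest memory size (writing a 20GB+ memfile can take minutes).
    func snapshotVM(ctx context.Context, memPath string) error {
        select {
        case <-ctx.Done():
            return ctx.Err() // deadline or cancellation came from the caller
        default:
            return nil
        }
    }

    func main() {
        // Per-operation deadline instead of a client-wide 10s Timeout.
        ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
        defer cancel()
        if err := snapshotVM(ctx, "/srv/vm/memfile"); err != nil {
            fmt.Println("create VM snapshot:", err)
        }
    }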

Ensure CoW file is at least as large as the origin rootfs. Previously,
WRENN_DEFAULT_ROOTFS_SIZE=30Gi expanded the base image to 30GB but the
default 5GB CoW could not hold all writes, causing dm-snapshot
invalidation and EIO on all guest I/O.
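
For scale: with a 30GiB origin, rewriting every block needs roughly 30GiB of CoW
space plus dm-snapshot metadata, so the old 5GiB default was guaranteed to exhaust
under heavy writes. A rough sketch of the clamp; cowSizeFor is a hypothetical
helper, the real logic lives inline in Create/createFromSnapshot:

    // Sketch only: clamp the CoW size to at least the origin size.
    package main

    import "fmt"

    // cowSizeFor returns the requested CoW size, but never less than the
    // origin: if every origin block is rewritten, the CoW must hold a full
    // copy, and an exhausted CoW invalidates the dm-snapshot (EIO on all
    // subsequent guest I/O).
    func cowSizeFor(requestedBytes, originBytes int64) int64 {
        if requestedBytes < originBytes {
            return originBytes
        }
        return requestedBytes
    }

    func main() {
        const gib = int64(1) << 30
        // Old behaviour: 5GiB CoW over a 30GiB origin -> exhaustion.
        // Clamped: the CoW is sized up to the origin.
        fmt.Println(cowSizeFor(5*gib, 30*gib) / gib) // prints 30
    }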

Destroy frozen VMs in resumeOnError instead of leaving zombies that
report "running" but can't execute. Use fresh context for the resume
attempt so a cancelled caller context doesn't falsely trigger destroy.

Increase CP→Agent ResponseHeaderTimeout from 45s to 5min and
PrepareSnapshot timeout from 3s to 30s for large-memory VMs.

After failed pause, ping agent to detect destroyed sandboxes and mark
DB status as "error" instead of reverting to "running".
2026-05-04 01:46:57 +06:00
parent 1244c08e42
commit 51b5d7b3ba
4 changed files with 48 additions and 15 deletions

View File

@@ -186,9 +186,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
     }
     // Create dm-snapshot with per-sandbox CoW file.
+    // CoW must be at least as large as the origin — if every block is
+    // rewritten, the CoW stores a full copy. Undersized CoW causes
+    // dm-snapshot invalidation → EIO on all guest I/O.
     dmName := "wrenn-" + sandboxID
     cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
-    cowSize := int64(diskSizeMB) * 1024 * 1024
+    cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
     dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
     if err != nil {
         m.loops.Release(baseRootfs)
@@ -391,11 +394,13 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
     slog.Debug("pause: envd client idle connections closed", "id", sandboxID)

     // Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
-    // connections for post-restore cleanup). The 3s timeout also gives time
-    // for the FINs from Step 0b to be processed by the guest kernel.
+    // connections for post-restore cleanup). Also drops page cache which
+    // can take significant time on large-memory VMs (20GB+). The timeout
+    // also gives time for the FINs from Step 0b to be processed by the
+    // guest kernel.
     // Best-effort: a failure is logged but does not abort the pause.
     func() {
-        prepCtx, prepCancel := context.WithTimeout(ctx, 3*time.Second)
+        prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
         defer prepCancel()
         if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
             slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
@@ -423,12 +428,24 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
     // resumeOnError unpauses the VM so the sandbox stays usable when a
     // post-freeze step fails. If the resume itself fails, the sandbox is
-    // left frozen — the caller should destroy it. It also resets the
-    // connection tracker so the sandbox can accept proxy connections again.
+    // frozen and unrecoverable — destroy it to avoid a zombie that reports
+    // "running" but can't execute anything.
     resumeOnError := func() {
         sb.connTracker.Reset()
-        if err := m.vm.Resume(ctx, sandboxID); err != nil {
-            slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
+        // Use a fresh context — the caller's ctx may already be cancelled
+        // (e.g. CP-side ResponseHeaderTimeout fired), which would make the
+        // resume fail immediately and destroy a perfectly resumable VM.
+        resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
+        defer resumeCancel()
+        if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
+            slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
+            m.cleanup(context.Background(), sb)
+            m.mu.Lock()
+            delete(m.boxes, sandboxID)
+            m.mu.Unlock()
+            if m.onDestroy != nil {
+                m.onDestroy(sandboxID)
+            }
         }
     }
@@ -444,6 +461,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
     snapshotStart := time.Now()
     if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
+        slog.Error("pause: snapshot failed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart), "error", err)
         warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
         resumeOnError()
         return fmt.Errorf("create VM snapshot: %w", err)
@@ -1134,7 +1152,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
     dmName := "wrenn-" + sandboxID
     cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
-    cowSize := int64(diskSizeMB) * 1024 * 1024
+    cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
     dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
     if err != nil {
         source.Close()

View File

@@ -8,7 +8,6 @@ import (
     "io"
     "net"
     "net/http"
-    "time"
 )

 // fcClient talks to the Firecracker HTTP API over a Unix socket.
@@ -27,7 +26,9 @@ func newFCClient(socketPath string) *fcClient {
                 return d.DialContext(ctx, "unix", socketPath)
             },
         },
-        Timeout: 10 * time.Second,
+        // No global timeout — callers pass context.Context with appropriate
+        // deadlines. A fixed 10s timeout was too short for snapshot/resume
+        // operations on large-memory VMs (20GB+ memfiles).
     },
 }
 }

View File

@@ -47,7 +47,7 @@ func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool {
     TLSNextProto:        make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
     MaxIdleConnsPerHost: 20,
     IdleConnTimeout:     90 * time.Second,
-    ResponseHeaderTimeout: 45 * time.Second,
+    ResponseHeaderTimeout: 5 * time.Minute,
     DialContext: (&net.Dialer{
         Timeout:   10 * time.Second,
         KeepAlive: 30 * time.Second,

View File

@@ -239,12 +239,26 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
     if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
         SandboxId: sandboxIDStr,
     })); err != nil {
-        // Revert status on failure.
-        if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
-            ID: sandboxID, Status: "running",
+        // Check if the agent still has this sandbox. If it was destroyed
+        // (e.g. frozen VM couldn't be resumed), mark as "error" instead of
+        // reverting to "running" — which would create a ghost record.
+        // Use a fresh context since the original ctx may already be expired.
+        revertStatus := "running"
+        pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
+        if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
+            SandboxId: sandboxIDStr,
+        })); pingErr != nil {
+            revertStatus = "error"
+            slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
+        }
+        pingCancel()
+        dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
+        if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
+            ID: sandboxID, Status: revertStatus,
         }); dbErr != nil {
             slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
         }
+        dbCancel()
         return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
     }