forked from wrenn/wrenn
fix: resolve pause/snapshot failures and CoW exhaustion on large VMs
Remove the hard 10s timeout from the Firecracker HTTP client — callers already pass a context.Context with an appropriate deadline, and 20GB+ memfile writes easily exceed 10s. Ensure the CoW file is at least as large as the origin rootfs. Previously, WRENN_DEFAULT_ROOTFS_SIZE=30Gi expanded the base image to 30GB, but the default 5GB CoW could not hold all writes, causing dm-snapshot invalidation and EIO on all guest I/O. In resumeOnError, destroy a frozen VM when the resume attempt itself fails, instead of leaving a zombie that reports "running" but can't execute anything. Use a fresh context for the resume attempt so that a cancelled caller context doesn't make the resume fail and falsely trigger the destroy. Increase the CP→Agent ResponseHeaderTimeout from 45s to 5min and the PrepareSnapshot timeout from 3s to 30s for large-memory VMs. After a failed pause, ping the agent to detect destroyed sandboxes and mark the DB status as "error" instead of reverting it to "running".
This commit is contained in:
@ -186,9 +186,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Create dm-snapshot with per-sandbox CoW file.
|
// Create dm-snapshot with per-sandbox CoW file.
|
||||||
|
// CoW must be at least as large as the origin — if every block is
|
||||||
|
// rewritten, the CoW stores a full copy. Undersized CoW causes
|
||||||
|
// dm-snapshot invalidation → EIO on all guest I/O.
|
||||||
dmName := "wrenn-" + sandboxID
|
dmName := "wrenn-" + sandboxID
|
||||||
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
||||||
cowSize := int64(diskSizeMB) * 1024 * 1024
|
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
|
||||||
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
m.loops.Release(baseRootfs)
|
m.loops.Release(baseRootfs)
|
||||||
@ -391,11 +394,13 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
slog.Debug("pause: envd client idle connections closed", "id", sandboxID)
|
slog.Debug("pause: envd client idle connections closed", "id", sandboxID)
|
||||||
|
|
||||||
// Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
|
// Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
|
||||||
// connections for post-restore cleanup). The 3s timeout also gives time
|
// connections for post-restore cleanup). Also drops page cache which
|
||||||
// for the FINs from Step 0b to be processed by the guest kernel.
|
// can take significant time on large-memory VMs (20GB+). The timeout
|
||||||
|
// also gives time for the FINs from Step 0b to be processed by the
|
||||||
|
// guest kernel.
|
||||||
// Best-effort: a failure is logged but does not abort the pause.
|
// Best-effort: a failure is logged but does not abort the pause.
|
||||||
func() {
|
func() {
|
||||||
prepCtx, prepCancel := context.WithTimeout(ctx, 3*time.Second)
|
prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
|
||||||
defer prepCancel()
|
defer prepCancel()
|
||||||
if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
|
if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
|
||||||
slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
|
slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
|
||||||
@ -423,12 +428,24 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
|
|
||||||
// resumeOnError unpauses the VM so the sandbox stays usable when a
|
// resumeOnError unpauses the VM so the sandbox stays usable when a
|
||||||
// post-freeze step fails. If the resume itself fails, the sandbox is
|
// post-freeze step fails. If the resume itself fails, the sandbox is
|
||||||
// left frozen — the caller should destroy it. It also resets the
|
// frozen and unrecoverable — destroy it to avoid a zombie that reports
|
||||||
// connection tracker so the sandbox can accept proxy connections again.
|
// "running" but can't execute anything.
|
||||||
resumeOnError := func() {
|
resumeOnError := func() {
|
||||||
sb.connTracker.Reset()
|
sb.connTracker.Reset()
|
||||||
if err := m.vm.Resume(ctx, sandboxID); err != nil {
|
// Use a fresh context — the caller's ctx may already be cancelled
|
||||||
slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
|
// (e.g. CP-side ResponseHeaderTimeout fired), which would make the
|
||||||
|
// resume fail immediately and destroy a perfectly resumable VM.
|
||||||
|
resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer resumeCancel()
|
||||||
|
if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
|
||||||
|
slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
|
||||||
|
m.cleanup(context.Background(), sb)
|
||||||
|
m.mu.Lock()
|
||||||
|
delete(m.boxes, sandboxID)
|
||||||
|
m.mu.Unlock()
|
||||||
|
if m.onDestroy != nil {
|
||||||
|
m.onDestroy(sandboxID)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -444,6 +461,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
|
|
||||||
snapshotStart := time.Now()
|
snapshotStart := time.Now()
|
||||||
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
|
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
|
||||||
|
slog.Error("pause: snapshot failed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart), "error", err)
|
||||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||||
resumeOnError()
|
resumeOnError()
|
||||||
return fmt.Errorf("create VM snapshot: %w", err)
|
return fmt.Errorf("create VM snapshot: %w", err)
|
||||||
@ -1134,7 +1152,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
|
|||||||
|
|
||||||
dmName := "wrenn-" + sandboxID
|
dmName := "wrenn-" + sandboxID
|
||||||
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
||||||
cowSize := int64(diskSizeMB) * 1024 * 1024
|
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
|
||||||
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
source.Close()
|
source.Close()
|
||||||
|
|||||||
@ -8,7 +8,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"time"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// fcClient talks to the Firecracker HTTP API over a Unix socket.
|
// fcClient talks to the Firecracker HTTP API over a Unix socket.
|
||||||
@ -27,7 +26,9 @@ func newFCClient(socketPath string) *fcClient {
|
|||||||
return d.DialContext(ctx, "unix", socketPath)
|
return d.DialContext(ctx, "unix", socketPath)
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Timeout: 10 * time.Second,
|
// No global timeout — callers pass context.Context with appropriate
|
||||||
|
// deadlines. A fixed 10s timeout was too short for snapshot/resume
|
||||||
|
// operations on large-memory VMs (20GB+ memfiles).
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -47,7 +47,7 @@ func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool {
|
|||||||
TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
|
TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
|
||||||
MaxIdleConnsPerHost: 20,
|
MaxIdleConnsPerHost: 20,
|
||||||
IdleConnTimeout: 90 * time.Second,
|
IdleConnTimeout: 90 * time.Second,
|
||||||
ResponseHeaderTimeout: 45 * time.Second,
|
ResponseHeaderTimeout: 5 * time.Minute,
|
||||||
DialContext: (&net.Dialer{
|
DialContext: (&net.Dialer{
|
||||||
Timeout: 10 * time.Second,
|
Timeout: 10 * time.Second,
|
||||||
KeepAlive: 30 * time.Second,
|
KeepAlive: 30 * time.Second,
|
||||||
|
|||||||
@ -239,12 +239,26 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
|
|||||||
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
|
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
|
||||||
SandboxId: sandboxIDStr,
|
SandboxId: sandboxIDStr,
|
||||||
})); err != nil {
|
})); err != nil {
|
||||||
// Revert status on failure.
|
// Check if the agent still has this sandbox. If it was destroyed
|
||||||
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
|
||||||
ID: sandboxID, Status: "running",
|
// reverting to "running" — which would create a ghost record.
|
||||||
|
// Use a fresh context since the original ctx may already be expired.
|
||||||
|
revertStatus := "running"
|
||||||
|
pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
|
if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
|
||||||
|
SandboxId: sandboxIDStr,
|
||||||
|
})); pingErr != nil {
|
||||||
|
revertStatus = "error"
|
||||||
|
slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
|
||||||
|
}
|
||||||
|
pingCancel()
|
||||||
|
dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
|
||||||
|
ID: sandboxID, Status: revertStatus,
|
||||||
}); dbErr != nil {
|
}); dbErr != nil {
|
||||||
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
|
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
|
||||||
}
|
}
|
||||||
|
dbCancel()
|
||||||
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
|
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user