fix: resolve pause/snapshot failures and CoW exhaustion on large VMs

Remove the hard 10s timeout from the Firecracker HTTP client — callers already pass a context.Context with appropriate deadlines, and 20GB+ memfile writes easily exceed 10s. Ensure the CoW file is at least as large as the origin rootfs. Previously, WRENN_DEFAULT_ROOTFS_SIZE=30Gi expanded the base image to 30GB, but the default 5GB CoW could not hold all writes, causing dm-snapshot invalidation and EIO on all guest I/O. Destroy frozen VMs in resumeOnError instead of leaving zombies that report "running" but can't execute. Use a fresh context for the resume attempt so that a cancelled caller context doesn't falsely trigger the destroy. Increase the CP→Agent ResponseHeaderTimeout from 45s to 5min and the PrepareSnapshot timeout from 3s to 30s for large-memory VMs. After a failed pause, ping the agent to check whether the sandbox still exists: if the ping fails (the sandbox was destroyed), mark the DB status as "error"; otherwise revert it to "running" as before.
2026-05-04 01:46:57 +06:00
parent 1244c08e42
commit 51b5d7b3ba
4 changed files with 48 additions and 15 deletions
--- a/pkg/lifecycle/hostpool.go
+++ b/pkg/lifecycle/hostpool.go
@@ -47,7 +47,7 @@ func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool {
 		TLSNextProto:          make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
 		MaxIdleConnsPerHost:   20,
 		IdleConnTimeout:       90 * time.Second,
-		ResponseHeaderTimeout: 45 * time.Second,
+		ResponseHeaderTimeout: 5 * time.Minute,
 		DialContext: (&net.Dialer{
 			Timeout:   10 * time.Second,
 			KeepAlive: 30 * time.Second,
--- a/pkg/service/sandbox.go
+++ b/pkg/service/sandbox.go
@@ -239,12 +239,26 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
 	if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
 		SandboxId: sandboxIDStr,
 	})); err != nil {
-		// Revert status on failure.
-		if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
-			ID: sandboxID, Status: "running",
+		// Check if the agent still has this sandbox. If it was destroyed
+		// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
+		// reverting to "running" — which would create a ghost record.
+		// Use a fresh context since the original ctx may already be expired.
+		revertStatus := "running"
+		pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
+		if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
+			SandboxId: sandboxIDStr,
+		})); pingErr != nil {
+			revertStatus = "error"
+			slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
+		}
+		pingCancel()
+		dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
+		if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
+			ID: sandboxID, Status: revertStatus,
 		}); dbErr != nil {
 			slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
 		}
+		dbCancel()
 		return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
 	}