1
0
forked from wrenn/wrenn

fix: resolve pause/snapshot failures and CoW exhaustion on large VMs

Remove hard 10s timeout from Firecracker HTTP client — callers already
pass context.Context with appropriate deadlines, and 20GB+ memfile
writes easily exceed 10s.

Ensure CoW file is at least as large as the origin rootfs. Previously,
WRENN_DEFAULT_ROOTFS_SIZE=30Gi expanded the base image to 30GB but the
default 5GB CoW could not hold all writes, causing dm-snapshot
invalidation and EIO on all guest I/O.

Destroy frozen VMs in resumeOnError instead of leaving zombies that
report "running" but can't execute. Use fresh context for the resume
attempt so a cancelled caller context doesn't falsely trigger destroy.

Increase CP→Agent ResponseHeaderTimeout from 45s to 5min and
PrepareSnapshot timeout from 3s to 30s for large-memory VMs.

After failed pause, ping agent to detect destroyed sandboxes and mark
DB status as "error" instead of reverting to "running".
This commit is contained in:
2026-05-04 01:46:57 +06:00
parent 1244c08e42
commit 51b5d7b3ba
4 changed files with 48 additions and 15 deletions

View File

@ -47,7 +47,7 @@ func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool {
TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper),
MaxIdleConnsPerHost: 20,
IdleConnTimeout: 90 * time.Second,
ResponseHeaderTimeout: 45 * time.Second,
ResponseHeaderTimeout: 5 * time.Minute,
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,

View File

@ -239,12 +239,26 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil {
// Revert status on failure.
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "running",
// Check if the agent still has this sandbox. If it was destroyed
// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
// reverting to "running" — which would create a ghost record.
// Use a fresh context since the original ctx may already be expired.
revertStatus := "running"
pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
SandboxId: sandboxIDStr,
})); pingErr != nil {
revertStatus = "error"
slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
}
pingCancel()
dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: revertStatus,
}); dbErr != nil {
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
}
dbCancel()
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
}