1
0
forked from wrenn/wrenn

Fix snapshot race, delete auth, sparse dd, default disk to 5GB

Snapshot race fix:
- Pre-mark sandbox as "paused" in DB before issuing CreateSnapshot and
  PauseSandbox RPCs, preventing the reconciler from marking it "stopped"
  during the flatten window when the sandbox is gone from the host
  agent's in-memory map but DB still says "running"
- Revert status to "running" on RPC failure
- Check ctx.Err() before writing response to avoid writing to dead
  connections when client disconnects during long snapshot operations

Delete auth fix:
- Reject non-admin deletion of platform templates (team_id = all-zeros)
  at DELETE /v1/snapshots/{name} with 403, so template files are no
  longer deleted before the team ownership check fails

Sparse dd:
- Add conv=sparse to dd in FlattenSnapshot so flattened images preserve
  sparseness (~200MB actual vs 5GB logical)

Default disk size:
- Change default disk_size_mb from 20480 (20 GB) to 5120 (5 GB) across
  migration, manager, service, build, and EnsureImageSizes
- Dashboard (relates to the delete auth fix above): disable the
  split-button dropdown arrow for platform templates on the snapshots
  page, since teams cannot delete platform templates
This commit is contained in:
2026-03-28 14:30:18 +06:00
parent c89a664a37
commit 34af77e0d8
11 changed files with 89 additions and 42 deletions

View File

@ -79,7 +79,7 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
p.MemoryMB = 512
}
if p.DiskSizeMB <= 0 {
p.DiskSizeMB = 20480 // 20 GB default
p.DiskSizeMB = 5120 // 5 GB default
}
// If the template is a snapshot, use its baked-in vcpus/memory.
@ -187,20 +187,32 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
sandboxIDStr := id.FormatSandboxID(sandboxID)
// Pre-mark as "paused" in DB before the RPC so the reconciler does not
// mark the sandbox "stopped" while the host agent processes the pause.
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "paused",
}); err != nil {
return db.Sandbox{}, fmt.Errorf("pre-mark paused: %w", err)
}
// Flush all metrics tiers before pausing so data survives in DB.
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil {
// Revert status on failure.
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "running",
}); dbErr != nil {
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
}
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
}
sb, err = s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "paused",
})
sb, err = s.DB.GetSandbox(ctx, sandboxID)
if err != nil {
return db.Sandbox{}, fmt.Errorf("update status: %w", err)
return db.Sandbox{}, fmt.Errorf("get sandbox after pause: %w", err)
}
return sb, nil
}