forked from wrenn/wrenn
Fix snapshot race, delete auth, sparse dd, default disk to 5GB
Snapshot race fix:
- Pre-mark sandbox as "paused" in DB before issuing the CreateSnapshot and
PauseSandbox RPCs. This prevents the reconciler from marking it "stopped"
during the flatten window, when the sandbox is already gone from the host
agent's in-memory map but the DB still says "running"
- Revert status to "running" on RPC failure
- Run the snapshot RPC on a detached context (5-minute timeout) and check
ctx.Err() before writing the response, to avoid writing to a dead
connection when the client disconnects during a long snapshot operation
Delete auth fix:
- Block non-admin deletion of platform templates (team_id = all-zeros)
at DELETE /v1/snapshots/{name} with 403, so that snapshot files are no
longer deleted before the team ownership check rejects the request
Sparse dd:
- Add conv=sparse to dd in FlattenSnapshot so flattened images preserve
sparseness (~200MB actual vs 5GB logical)
Default disk size:
- Change default disk_size_mb from 20480 (20 GB) to 5120 (5 GB) across
migration, manager, service, build, and EnsureImageSizes
- Dashboard (part of the delete auth fix): disable the split-button dropdown
arrow for platform templates on the snapshots page, since teams cannot
delete platform templates
This commit is contained in:
@ -144,7 +144,7 @@ CREATE TABLE sandboxes (
|
|||||||
vcpus INTEGER NOT NULL DEFAULT 1,
|
vcpus INTEGER NOT NULL DEFAULT 1,
|
||||||
memory_mb INTEGER NOT NULL DEFAULT 512,
|
memory_mb INTEGER NOT NULL DEFAULT 512,
|
||||||
timeout_sec INTEGER NOT NULL DEFAULT 300,
|
timeout_sec INTEGER NOT NULL DEFAULT 300,
|
||||||
disk_size_mb INTEGER NOT NULL DEFAULT 20480,
|
disk_size_mb INTEGER NOT NULL DEFAULT 5120,
|
||||||
guest_ip TEXT NOT NULL DEFAULT '',
|
guest_ip TEXT NOT NULL DEFAULT '',
|
||||||
host_ip TEXT NOT NULL DEFAULT '',
|
host_ip TEXT NOT NULL DEFAULT '',
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
|||||||
@ -54,6 +54,7 @@ export type Snapshot = {
|
|||||||
memory_mb?: number;
|
memory_mb?: number;
|
||||||
size_bytes: number;
|
size_bytes: number;
|
||||||
created_at: string;
|
created_at: string;
|
||||||
|
platform: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export async function createSnapshot(sandboxId: string, name?: string): Promise<ApiResult<Snapshot>> {
|
export async function createSnapshot(sandboxId: string, name?: string): Promise<ApiResult<Snapshot>> {
|
||||||
|
|||||||
@ -423,6 +423,7 @@
|
|||||||
<div class="w-px shrink-0 bg-[var(--color-border-mid)]"></div>
|
<div class="w-px shrink-0 bg-[var(--color-border-mid)]"></div>
|
||||||
<!-- Chevron / dropdown trigger -->
|
<!-- Chevron / dropdown trigger -->
|
||||||
<button
|
<button
|
||||||
|
disabled={snapshot.platform}
|
||||||
onclick={(e) => {
|
onclick={(e) => {
|
||||||
e.stopPropagation();
|
e.stopPropagation();
|
||||||
if (openDropdownName === snapshot.name) {
|
if (openDropdownName === snapshot.name) {
|
||||||
@ -433,7 +434,7 @@
|
|||||||
openDropdownName = snapshot.name;
|
openDropdownName = snapshot.name;
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
class="flex items-center px-2 py-1.5 text-[var(--color-text-secondary)] transition-colors duration-150 hover:bg-[var(--color-bg-4)] hover:text-[var(--color-text-bright)]"
|
class="flex items-center px-2 py-1.5 text-[var(--color-text-secondary)] transition-colors duration-150 hover:bg-[var(--color-bg-4)] hover:text-[var(--color-text-bright)] disabled:cursor-not-allowed disabled:opacity-30 disabled:hover:bg-transparent disabled:hover:text-[var(--color-text-secondary)]"
|
||||||
>
|
>
|
||||||
<svg
|
<svg
|
||||||
class="transition-transform duration-150 {openDropdownName === snapshot.name ? 'rotate-180' : ''}"
|
class="transition-transform duration-150 {openDropdownName === snapshot.name ? 'rotate-180' : ''}"
|
||||||
@ -484,6 +485,7 @@
|
|||||||
class="fixed z-50 w-32 overflow-hidden rounded-[var(--radius-card)] border border-[var(--color-border-mid)] bg-[var(--color-bg-2)] py-1"
|
class="fixed z-50 w-32 overflow-hidden rounded-[var(--radius-card)] border border-[var(--color-border-mid)] bg-[var(--color-bg-2)] py-1"
|
||||||
style="top: {dropdownPos.top}px; left: {dropdownPos.left}px; animation: fadeUp 0.15s ease both"
|
style="top: {dropdownPos.top}px; left: {dropdownPos.left}px; animation: fadeUp 0.15s ease both"
|
||||||
>
|
>
|
||||||
|
{#if !dropdownSnapshot.platform}
|
||||||
<button
|
<button
|
||||||
onclick={(e) => {
|
onclick={(e) => {
|
||||||
e.stopPropagation();
|
e.stopPropagation();
|
||||||
@ -499,6 +501,7 @@
|
|||||||
</svg>
|
</svg>
|
||||||
Delete
|
Delete
|
||||||
</button>
|
</button>
|
||||||
|
{/if}
|
||||||
</div>
|
</div>
|
||||||
{/if}
|
{/if}
|
||||||
{/if}
|
{/if}
|
||||||
|
|||||||
@ -69,6 +69,7 @@ type snapshotResponse struct {
|
|||||||
MemoryMB *int32 `json:"memory_mb,omitempty"`
|
MemoryMB *int32 `json:"memory_mb,omitempty"`
|
||||||
SizeBytes int64 `json:"size_bytes"`
|
SizeBytes int64 `json:"size_bytes"`
|
||||||
CreatedAt string `json:"created_at"`
|
CreatedAt string `json:"created_at"`
|
||||||
|
Platform bool `json:"platform"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func templateToResponse(t db.Template) snapshotResponse {
|
func templateToResponse(t db.Template) snapshotResponse {
|
||||||
@ -76,6 +77,7 @@ func templateToResponse(t db.Template) snapshotResponse {
|
|||||||
Name: t.Name,
|
Name: t.Name,
|
||||||
Type: t.Type,
|
Type: t.Type,
|
||||||
SizeBytes: t.SizeBytes,
|
SizeBytes: t.SizeBytes,
|
||||||
|
Platform: t.TeamID == id.PlatformTeamID,
|
||||||
}
|
}
|
||||||
if t.Vcpus != 0 {
|
if t.Vcpus != 0 {
|
||||||
resp.VCPUs = &t.Vcpus
|
resp.VCPUs = &t.Vcpus
|
||||||
@ -154,26 +156,43 @@ func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
resp, err := agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
// Pre-mark sandbox as "paused" in DB BEFORE issuing the snapshot RPC.
|
||||||
|
// The host agent's CreateSnapshot removes the sandbox from its in-memory
|
||||||
|
// map immediately; if the reconciler fires during the flatten window and
|
||||||
|
// the DB still says "running", it will mark the sandbox "stopped".
|
||||||
|
if sb.Status == "running" {
|
||||||
|
if _, err := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||||
|
ID: sandboxID, Status: "paused",
|
||||||
|
}); err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, "db_error", "failed to update sandbox status")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a detached context with a generous timeout so the snapshot completes
|
||||||
|
// even if the client disconnects (the flatten step can take 10-20s).
|
||||||
|
snapCtx, snapCancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||||
|
defer snapCancel()
|
||||||
|
|
||||||
|
resp, err := agent.CreateSnapshot(snapCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
||||||
SandboxId: req.SandboxID,
|
SandboxId: req.SandboxID,
|
||||||
Name: req.Name,
|
Name: req.Name,
|
||||||
}))
|
}))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// Snapshot failed — revert status back to what it was.
|
||||||
|
if sb.Status == "running" {
|
||||||
|
if _, dbErr := h.db.UpdateSandboxStatus(snapCtx, db.UpdateSandboxStatusParams{
|
||||||
|
ID: sandboxID, Status: "running",
|
||||||
|
}); dbErr != nil {
|
||||||
|
slog.Error("failed to revert sandbox status after snapshot error", "sandbox_id", req.SandboxID, "error", dbErr)
|
||||||
|
}
|
||||||
|
}
|
||||||
status, code, msg := agentErrToHTTP(err)
|
status, code, msg := agentErrToHTTP(err)
|
||||||
writeError(w, status, code, msg)
|
writeError(w, status, code, msg)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mark sandbox as paused (if it was running, it got paused by the snapshot).
|
tmpl, err := h.db.InsertTemplate(snapCtx, db.InsertTemplateParams{
|
||||||
if sb.Status != "paused" {
|
|
||||||
if _, err := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
|
||||||
ID: sandboxID, Status: "paused",
|
|
||||||
}); err != nil {
|
|
||||||
slog.Error("failed to update sandbox status after snapshot", "sandbox_id", req.SandboxID, "error", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tmpl, err := h.db.InsertTemplate(ctx, db.InsertTemplateParams{
|
|
||||||
Name: req.Name,
|
Name: req.Name,
|
||||||
Type: "snapshot",
|
Type: "snapshot",
|
||||||
Vcpus: sb.Vcpus,
|
Vcpus: sb.Vcpus,
|
||||||
@ -187,7 +206,12 @@ func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
h.audit.LogSnapshotCreate(r.Context(), ac, req.Name)
|
h.audit.LogSnapshotCreate(snapCtx, ac, req.Name)
|
||||||
|
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
slog.Info("snapshot created but client disconnected before response", "name", req.Name)
|
||||||
|
return
|
||||||
|
}
|
||||||
writeJSON(w, http.StatusCreated, templateToResponse(tmpl))
|
writeJSON(w, http.StatusCreated, templateToResponse(tmpl))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -220,10 +244,16 @@ func (h *snapshotHandler) Delete(w http.ResponseWriter, r *http.Request) {
|
|||||||
ctx := r.Context()
|
ctx := r.Context()
|
||||||
ac := auth.MustFromContext(ctx)
|
ac := auth.MustFromContext(ctx)
|
||||||
|
|
||||||
if _, err := h.db.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: name, TeamID: ac.TeamID}); err != nil {
|
tmpl, err := h.db.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: name, TeamID: ac.TeamID})
|
||||||
|
if err != nil {
|
||||||
writeError(w, http.StatusNotFound, "not_found", "template not found")
|
writeError(w, http.StatusNotFound, "not_found", "template not found")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Platform templates can only be deleted by admins via /v1/admin/templates.
|
||||||
|
if tmpl.TeamID == id.PlatformTeamID {
|
||||||
|
writeError(w, http.StatusForbidden, "forbidden", "platform templates cannot be deleted here")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
if err := h.deleteSnapshotBroadcast(ctx, name); err != nil {
|
if err := h.deleteSnapshotBroadcast(ctx, name); err != nil {
|
||||||
writeError(w, http.StatusInternalServerError, "agent_error", "failed to delete snapshot files")
|
writeError(w, http.StatusInternalServerError, "agent_error", "failed to delete snapshot files")
|
||||||
|
|||||||
@ -224,6 +224,7 @@ func FlattenSnapshot(dmDevPath, outputPath string) error {
|
|||||||
"if="+dmDevPath,
|
"if="+dmDevPath,
|
||||||
"of="+outputPath,
|
"of="+outputPath,
|
||||||
"bs=4M",
|
"bs=4M",
|
||||||
|
"conv=sparse",
|
||||||
"status=none",
|
"status=none",
|
||||||
)
|
)
|
||||||
if out, err := cmd.CombinedOutput(); err != nil {
|
if out, err := cmd.CombinedOutput(); err != nil {
|
||||||
|
|||||||
@ -12,7 +12,7 @@ import (
|
|||||||
// than this are expanded at startup so that dm-snapshot sandboxes see the full
|
// than this are expanded at startup so that dm-snapshot sandboxes see the full
|
||||||
// size without per-sandbox copies. The expansion is sparse — only metadata
|
// size without per-sandbox copies. The expansion is sparse — only metadata
|
||||||
// changes; no physical disk is consumed beyond the original content.
|
// changes; no physical disk is consumed beyond the original content.
|
||||||
const DefaultDiskSizeMB = 20480 // 20 GB
|
const DefaultDiskSizeMB = 5120 // 5 GB
|
||||||
|
|
||||||
// EnsureImageSizes walks the images directory and expands any rootfs.ext4 that
|
// EnsureImageSizes walks the images directory and expands any rootfs.ext4 that
|
||||||
// is smaller than the target size. This is idempotent: images already at or
|
// is smaller than the target size. This is idempotent: images already at or
|
||||||
|
|||||||
@ -107,7 +107,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
|
|||||||
memoryMB = 512
|
memoryMB = 512
|
||||||
}
|
}
|
||||||
if diskSizeMB <= 0 {
|
if diskSizeMB <= 0 {
|
||||||
diskSizeMB = 20480 // 20 GB default
|
diskSizeMB = 5120 // 5 GB default
|
||||||
}
|
}
|
||||||
|
|
||||||
if template == "" {
|
if template == "" {
|
||||||
|
|||||||
@ -213,7 +213,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
|
|||||||
Vcpus: build.Vcpus,
|
Vcpus: build.Vcpus,
|
||||||
MemoryMb: build.MemoryMb,
|
MemoryMb: build.MemoryMb,
|
||||||
TimeoutSec: 0, // no auto-pause for builds
|
TimeoutSec: 0, // no auto-pause for builds
|
||||||
DiskSizeMb: 20480, // 20 GB for template builds
|
DiskSizeMb: 5120, // 5 GB for template builds
|
||||||
}))
|
}))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
||||||
|
|||||||
@ -79,7 +79,7 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
|||||||
p.MemoryMB = 512
|
p.MemoryMB = 512
|
||||||
}
|
}
|
||||||
if p.DiskSizeMB <= 0 {
|
if p.DiskSizeMB <= 0 {
|
||||||
p.DiskSizeMB = 20480 // 20 GB default
|
p.DiskSizeMB = 5120 // 5 GB default
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the template is a snapshot, use its baked-in vcpus/memory.
|
// If the template is a snapshot, use its baked-in vcpus/memory.
|
||||||
@ -187,20 +187,32 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
|
|||||||
|
|
||||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||||
|
|
||||||
|
// Pre-mark as "paused" in DB before the RPC so the reconciler does not
|
||||||
|
// mark the sandbox "stopped" while the host agent processes the pause.
|
||||||
|
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||||
|
ID: sandboxID, Status: "paused",
|
||||||
|
}); err != nil {
|
||||||
|
return db.Sandbox{}, fmt.Errorf("pre-mark paused: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Flush all metrics tiers before pausing so data survives in DB.
|
// Flush all metrics tiers before pausing so data survives in DB.
|
||||||
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
|
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
|
||||||
|
|
||||||
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
|
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
|
||||||
SandboxId: sandboxIDStr,
|
SandboxId: sandboxIDStr,
|
||||||
})); err != nil {
|
})); err != nil {
|
||||||
|
// Revert status on failure.
|
||||||
|
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||||
|
ID: sandboxID, Status: "running",
|
||||||
|
}); dbErr != nil {
|
||||||
|
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
|
||||||
|
}
|
||||||
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
|
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
sb, err = s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
sb, err = s.DB.GetSandbox(ctx, sandboxID)
|
||||||
ID: sandboxID, Status: "paused",
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return db.Sandbox{}, fmt.Errorf("update status: %w", err)
|
return db.Sandbox{}, fmt.Errorf("get sandbox after pause: %w", err)
|
||||||
}
|
}
|
||||||
return sb, nil
|
return sb, nil
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,8 +34,8 @@ type CreateSandboxRequest struct {
|
|||||||
// TTL in seconds. Sandbox is auto-paused after this duration of
|
// TTL in seconds. Sandbox is auto-paused after this duration of
|
||||||
// inactivity. 0 means no auto-pause.
|
// inactivity. 0 means no auto-pause.
|
||||||
TimeoutSec int32 `protobuf:"varint,4,opt,name=timeout_sec,json=timeoutSec,proto3" json:"timeout_sec,omitempty"`
|
TimeoutSec int32 `protobuf:"varint,4,opt,name=timeout_sec,json=timeoutSec,proto3" json:"timeout_sec,omitempty"`
|
||||||
// Disk size in MB for the sparse CoW file. Limits how much data the
|
// Disk size in MB for the rootfs. Base images are expanded to this size
|
||||||
// sandbox can write beyond the base image. Default: 20480 (20 GB).
|
// at host agent startup. Default: 5120 (5 GB).
|
||||||
DiskSizeMb int32 `protobuf:"varint,6,opt,name=disk_size_mb,json=diskSizeMb,proto3" json:"disk_size_mb,omitempty"`
|
DiskSizeMb int32 `protobuf:"varint,6,opt,name=disk_size_mb,json=diskSizeMb,proto3" json:"disk_size_mb,omitempty"`
|
||||||
unknownFields protoimpl.UnknownFields
|
unknownFields protoimpl.UnknownFields
|
||||||
sizeCache protoimpl.SizeCache
|
sizeCache protoimpl.SizeCache
|
||||||
|
|||||||
@ -86,8 +86,8 @@ message CreateSandboxRequest {
|
|||||||
// inactivity. 0 means no auto-pause.
|
// inactivity. 0 means no auto-pause.
|
||||||
int32 timeout_sec = 4;
|
int32 timeout_sec = 4;
|
||||||
|
|
||||||
// Disk size in MB for the sparse CoW file. Limits how much data the
|
// Disk size in MB for the rootfs. Base images are expanded to this size
|
||||||
// sandbox can write beyond the base image. Default: 20480 (20 GB).
|
// at host agent startup. Default: 5120 (5 GB).
|
||||||
int32 disk_size_mb = 6;
|
int32 disk_size_mb = 6;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user