// Package sandbox: pause / resume / live-snapshot orchestration.
//
// Two high-level operations both built on the same CH primitives. Names use
// wrenn.* vs ch.* so it is clear which layer a step belongs to.
//
//	wrenn.snapshot = ch.pause + ch.snapshot + ch.resume
//	    artefacts -> WRENN_DIR/images/teams/{teamID}/{templateID}/
//	    sandbox keeps running; dm-snapshot also flattened into
//	    rootfs.ext4 so the dir is a self-contained template.
//
//	wrenn.pause = ch.pause + ch.snapshot + ch.destroy
//	    artefacts -> WRENN_DIR/sandboxes/{sandboxID}/
//	    VM torn down; CoW file at WRENN_DIR/sandboxes/{id}/rootfs.cow
//	    + network slot retained so resume reaches the same host-IP.
//
// Pause always writes to a staging directory and swaps it into place after
// ch.destroy releases CH's open fd to the previous generation's
// memory-ranges (held via userfaultfd for lazy memory restore). This is what
// makes pause-resume-pause-resume chains correct: an in-place rewrite would
// risk CH reading from the file we are simultaneously overwriting.
//
// CH 52+ writes memory-ranges as a sparse file via SEEK_DATA/SEEK_HOLE,
// combined with `thp:false` + `free_page_reporting:true` on the balloon.
// Guest free pages are reclaimed by envd's /snapshot/prepare step (see
// quiesceAndPauseCH) rather than by an explicit balloon inflation. After
// ch.snapshot the staged files additionally get a best-effort
// punchZeroPagesInDir pass for zero pages CH wrote verbatim.
package sandbox

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/jackc/pgx/v5/pgtype"

	"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
	"git.omukk.dev/wrenn/wrenn/internal/layout"
	"git.omukk.dev/wrenn/wrenn/internal/models"
	"git.omukk.dev/wrenn/wrenn/internal/network"
	"git.omukk.dev/wrenn/wrenn/internal/snapshot"
	"git.omukk.dev/wrenn/wrenn/internal/vm"
	"git.omukk.dev/wrenn/wrenn/pkg/id"
)

const (
	// snapshotMetaFile is the per-snapshot metadata file holding the info
	// needed to restore the sandbox (template, resources, slot, etc.). It is
	// written by writeSnapshotMeta and read back by readSnapshotMeta.
	snapshotMetaFile = "wrenn-snapshot.json"

	// drainTimeout is how long pause waits for in-flight proxy connections
	// to release before forcibly cancelling them.
	drainTimeout = 5 * time.Second

	// prepareSnapshotTimeout bounds the in-guest /snapshot/prepare call.
	// Short on purpose: envd PrepareSnapshot is best-effort, and a wedged
	// guest must not block the host-side pause path.
	prepareSnapshotTimeout = 5 * time.Second

	// vmInfoProbeTimeout bounds the CH /vm.info liveness probe issued
	// before destructive CH ops (pause/snapshot). Local unix-socket call —
	// kept tight so a dead socket fails fast.
	vmInfoProbeTimeout = 3 * time.Second

	// vmPauseTimeout bounds ch.pause. Pause itself is fast; the deadline
	// guards against a wedged CH unix socket hanging the request.
	vmPauseTimeout = 30 * time.Second
)

// snapshotMeta is persisted into every snapshot directory. It captures the
// minimum information needed to restore the sandbox or build a new sandbox
// from a template, independent of the in-memory state in m.boxes.
type snapshotMeta struct {
	// TemplateName is the human-readable template name. Set for snapshot
	// templates (CreateSnapshot); empty for pause snapshots.
	TemplateName string `json:"template_name,omitempty"`

	// TeamID and TemplateID are the string forms of the owning team and
	// template UUIDs (rendered with id.UUIDString).
	TeamID     string `json:"team_id"`
	TemplateID string `json:"template_id"`

	// VCPUs, MemoryMB and TimeoutSec are copied from the source sandbox at
	// snapshot time.
	VCPUs      int `json:"vcpus"`
	MemoryMB   int `json:"memory_mb"`
	TimeoutSec int `json:"timeout_sec"`

	// SlotIndex is the retained network slot. Only meaningful for pause
	// snapshots — resume re-acquires the same slot so the host-IP is stable.
	// Omitted for snapshot templates, which allocate a fresh slot per launch.
	// Because of omitempty a value of 0 is also omitted; it decodes back to 0.
	SlotIndex int `json:"slot_index,omitempty"`

	// BaseTemplate is the base image path; resume hands it to the loop
	// registry (m.loops.Acquire) to obtain the origin device.
	BaseTemplate string `json:"base_template"`

	// CowPath is the persistent CoW file that resume re-attaches through
	// devicemapper.RestoreSnapshot. Pause snapshots set it; the template
	// writers in this file leave it empty.
	CowPath string `json:"cow_path,omitempty"`

	// SandboxDir pins the CH SandboxDir on restore — the tmpfs path baked
	// into CH's saved config.json. Always set: a restored sandbox gets a
	// fresh ID, but config.json keeps the tmpfs path of the sandbox the
	// snapshot was taken from, so the launcher must reconstruct it exactly.
	// For a snapshot-of-a-snapshot this is the root ancestor's path, carried
	// forward verbatim through the chain.
	SandboxDir string `json:"sandbox_dir"`

	// CreatedAt is the host wall-clock time at which the meta was built.
	CreatedAt time.Time `json:"created_at"`
}

// effectiveSandboxDir returns the tmpfs SandboxDir the running VM uses — the
// path baked into CH's config.json. A fresh-boot sandbox derives it from its
// own ID; a sandbox launched from a snapshot template inherits the override.
func effectiveSandboxDir(sb *sandboxState) string {
	// A non-empty override always wins over the ID-derived default.
	if sb.sandboxDirOverride != "" {
		return sb.sandboxDirOverride
	}
	return vm.SandboxTmpDir(sb.ID)
}

// writeSnapshotMeta serialises m as indented JSON into
// dir/wrenn-snapshot.json (mode 0644). dir must already exist. The file is
// written in place with os.WriteFile — callers in this file only target
// staging directories that are promoted afterwards.
func writeSnapshotMeta(dir string, m *snapshotMeta) error {
	data, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return fmt.Errorf("marshal snapshot meta: %w", err)
	}
	if err := os.WriteFile(filepath.Join(dir, snapshotMetaFile), data, 0o644); err != nil {
		return fmt.Errorf("write snapshot meta: %w", err)
	}
	return nil
}

// readSnapshotMeta loads and decodes dir/wrenn-snapshot.json. It returns a
// wrapped error when the file cannot be read or is not valid JSON; the
// returned pointer is nil in that case.
func readSnapshotMeta(dir string) (*snapshotMeta, error) {
	data, err := os.ReadFile(filepath.Join(dir, snapshotMetaFile))
	if err != nil {
		return nil, fmt.Errorf("read snapshot meta: %w", err)
	}
	var meta snapshotMeta
	if err := json.Unmarshal(data, &meta); err != nil {
		return nil, fmt.Errorf("unmarshal snapshot meta: %w", err)
	}
	return &meta, nil
}

// Pause freezes the VM, persists the snapshot to WRENN_DIR/sandboxes/{id}/,
// and tears down VM/network/dm resources. The CoW file is kept on disk so
// Resume can pick up where the sandbox left off.
//
// The sandbox stays in m.boxes with Status=Paused. The cow file at
// WRENN_DIR/sandboxes/{id}/rootfs.cow persists; on Resume it is re-attached
// via devicemapper.RestoreSnapshot.
//
// Write strategy: snapshot is written into a staging directory, the VM is
// destroyed (closing CH's open fd to any previous-generation memory-ranges),
// then the staging directory replaces the previous one via rename. This is
// essential for pause-resume-pause chains where CH holds the old
// memory-ranges open via userfaultfd while we write the new one.
//
// Outcomes:
//   - already Paused: returns nil without doing anything.
//   - not Running: returns an error wrapping ErrNotRunning.
//   - failure before the VM is destroyed: the VM is resumed and the sandbox
//     returns to Running (or Error if that resume itself fails).
//   - failure after the VM is destroyed: the sandbox is marked Error.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}

	// lifecycleMu is held for the whole operation, serialising Pause against
	// the other lifecycle operations that take the same lock.
	sb.lifecycleMu.Lock()
	defer sb.lifecycleMu.Unlock()

	if sb.Status == models.StatusPaused {
		return nil
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
	}

	// Wait for the post-resume memory loader to finish before snapshotting.
	// Without this, ch.snapshot's SEEK_DATA/SEEK_HOLE writer would emit holes
	// for any page not yet faulted in, which read back as zero on the next
	// restore — silent corruption across pause/resume chains.
	if err := m.waitForMemoryLoader(ctx, sb); err != nil {
		return fmt.Errorf("pause %s: %w", sandboxID, err)
	}

	m.mu.Lock()
	sb.Status = models.StatusPausing
	m.mu.Unlock()

	finalDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
	stageDir := layout.PauseStagingDir(m.cfg.WrennDir, sandboxID)

	// rollbackToRunning undoes a failed pause while the VM still exists: it
	// discards the staging dir, un-freezes CH, re-opens the connection
	// tracker, and returns an error naming the failed stage.
	rollbackToRunning := func(cause error, stage string) error {
		_ = os.RemoveAll(stageDir)
		// If the VM can't be unfrozen the sandbox is no longer usable.
		// Mark it Error so subsequent RPCs don't operate on a broken VM
		// (especially after a partial vm.snapshot which can leave CH wedged).
		if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil {
			m.mu.Lock()
			sb.Status = models.StatusError
			m.mu.Unlock()
			sb.connTracker.Reset()
			return fmt.Errorf("pause %s: %s: %w (and resume failed: %v)", sandboxID, stage, cause, rerr)
		}
		sb.connTracker.Reset()
		m.mu.Lock()
		sb.Status = models.StatusRunning
		m.mu.Unlock()
		return fmt.Errorf("pause %s: %s: %w", sandboxID, stage, cause)
	}

	if err := m.quiesceAndPauseCH(ctx, sb); err != nil {
		return rollbackToRunning(err, "quiesce")
	}

	// Memory materialisation is handled out-of-band by the background loader
	// kicked off by Resume after /init. We blocked on it above (waitForMemoryLoader)
	// so by the time we reach ch.snapshot every guest page is resident in CH's
	// memfile and SEEK_DATA/SEEK_HOLE produces a self-contained snapshot.
	if err := os.MkdirAll(stageDir, 0o755); err != nil {
		return rollbackToRunning(err, "mkdir staging")
	}
	if err := m.vm.Snapshot(ctx, sandboxID, stageDir); err != nil {
		return rollbackToRunning(err, "snapshot")
	}

	// Punch zero pages CH wrote verbatim (guest had them dirty-then-free
	// without notifying the balloon driver). Best-effort; failures only
	// cost disk space.
	punchZeroPagesInDir(stageDir)

	// Capture everything Resume needs while the runtime references
	// (sb.dmDevice in particular) are still populated — releaseRuntime below
	// clears them.
	meta := &snapshotMeta{
		TeamID:       id.UUIDString(pgtype.UUID{Bytes: sb.TemplateTeamID, Valid: true}),
		TemplateID:   id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}),
		VCPUs:        sb.VCPUs,
		MemoryMB:     sb.MemoryMB,
		TimeoutSec:   sb.TimeoutSec,
		SlotIndex:    sb.SlotIndex,
		BaseTemplate: sb.baseImagePath,
		CowPath:      sb.dmDevice.CowPath,
		SandboxDir:   effectiveSandboxDir(sb),
		CreatedAt:    time.Now(),
	}
	if err := writeSnapshotMeta(stageDir, meta); err != nil {
		// Without meta, Resume cannot reconstruct the sandbox. Treat as fatal.
		// (rollbackToRunning removes stageDir again; the extra call is
		// redundant but harmless.)
		_ = os.RemoveAll(stageDir)
		return rollbackToRunning(err, "write meta")
	}

	// releaseRuntime destroys the VM, which closes CH's open fd to any
	// previous-generation memory-ranges. Must happen BEFORE we touch finalDir
	// so the swap is safe. It also tears down the dm-snapshot so the CoW file
	// inside finalDir is no longer held open and can be moved.
	// From here on there is no VM to resume, so failures end in StatusError.
	m.releaseRuntime(sb, keepCow)

	// CoW lives at finalDir/rootfs.cow. swapDir replaces finalDir wholesale,
	// which would discard it. Move it into stageDir first so the swap carries
	// the CoW through alongside the new snapshot files.
	// A missing CoW file is tolerated (IsNotExist is not treated as an error).
	cowFinal := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID)
	cowStage := filepath.Join(stageDir, layout.SandboxCowName)
	if err := os.Rename(cowFinal, cowStage); err != nil && !os.IsNotExist(err) {
		m.mu.Lock()
		sb.Status = models.StatusError
		m.mu.Unlock()
		return fmt.Errorf("pause %s: stage cow: %w", sandboxID, err)
	}

	if err := swapDir(stageDir, finalDir); err != nil {
		// CH is already destroyed — we cannot roll back to Running. The
		// staging snapshot is still on disk for forensic recovery.
		m.mu.Lock()
		sb.Status = models.StatusError
		m.mu.Unlock()
		return fmt.Errorf("pause %s: swap snapshot dir: %w", sandboxID, err)
	}

	m.mu.Lock()
	sb.Status = models.StatusPaused
	m.mu.Unlock()

	slog.Info("sandbox paused", "id", sandboxID, "snapshot_dir", finalDir)
	return nil
}

// swapDir replaces final with stage using rename. Any existing final dir is
// moved aside to a uniquely-named trash dir (final + ".trash-" + a
// nanosecond timestamp) before the swap so the rename can succeed, then the
// trash is removed. The replacement is two renames, so there is a brief
// window in which final does not exist.
//
// Failure modes:
//   - stat of final fails for a reason other than not-exist: nothing is
//     touched and the error is returned.
//   - move-old-to-trash fails: previous final dir is intact. stage remains.
//   - stage-to-final fails: we attempt to restore old from trash. If that
//     fails, the sandbox is wedged but stage still holds valid data.
//   - trash removal fails: only logged; the previous generation is orphaned,
//     will be GC'd on next agent startup.
func swapDir(stage, final string) error { trash := final + ".trash-" + strconv.FormatInt(time.Now().UnixNano(), 10) hadOld := true if _, err := os.Stat(final); err != nil { if !os.IsNotExist(err) { return fmt.Errorf("stat existing final dir: %w", err) } hadOld = false } if hadOld { if err := os.Rename(final, trash); err != nil { return fmt.Errorf("move old final to trash: %w", err) } } if err := os.Rename(stage, final); err != nil { // Try to put the old one back. if hadOld { if rerr := os.Rename(trash, final); rerr != nil { slog.Warn("could not restore previous snapshot dir after failed swap", "trash", trash, "final", final, "error", rerr) } } return fmt.Errorf("move stage to final: %w", err) } if hadOld { if err := os.RemoveAll(trash); err != nil { slog.Warn("could not remove trashed snapshot dir", "path", trash, "error", err) } } return nil } // quiesceAndPauseCH drains envd connections, asks envd to quiesce its own // state, then issues ch.pause. On return the VM is frozen and ready for // ch.snapshot. Caller must either ch.resume or ch.destroy afterwards. // // Snapshot-size optimisation relies on virtio-balloon's free_page_reporting: // envd drops the VFS page cache + fstrim + a settle window inside // /snapshot/prepare, which gives the guest balloon driver time to report all // the now-free pages to the host. CH punches those reports out of the backing // memfile and v52+'s SEEK_DATA/SEEK_HOLE snapshot writer skips them. No // explicit balloon inflate is required — inflation would constrain the guest // post-resume (forced re-allocation of large free regions), and free_page_ // reporting drains everything we'd have inflated anyway. func (m *Manager) quiesceAndPauseCH(ctx context.Context, sb *sandboxState) error { sb.connTracker.Drain(drainTimeout) sb.connTracker.ForceClose() if c := sb.client.Load(); c != nil { // Bound the in-guest prepare call. 
If envd is wedged or the netns // is half-torn-down the connect/read can block for the full envd // client timeout (2m), which the user perceives as a hung snapshot. prepCtx, prepCancel := context.WithTimeout(ctx, prepareSnapshotTimeout) err := c.PrepareSnapshot(prepCtx) prepCancel() if err != nil { slog.Warn("envd prepare-snapshot failed (continuing)", "id", sb.ID, "error", err) } c.CloseIdleConnections() } // Verify CH is still alive before issuing destructive ops. Without this // a second snapshot attempt against a sandbox whose CH process died // would block on vm.pause until the unix-socket dial times out. probeCtx, probeCancel := context.WithTimeout(ctx, vmInfoProbeTimeout) state, err := m.vm.Info(probeCtx, sb.ID) probeCancel() if err != nil { return fmt.Errorf("ch.vm.info probe: %w", err) } if state != "Running" { return fmt.Errorf("ch.vm.info: VM in state %q, not Running", state) } pauseCtx, pauseCancel := context.WithTimeout(ctx, vmPauseTimeout) defer pauseCancel() if err := m.vm.Pause(pauseCtx, sb.ID); err != nil { return fmt.Errorf("ch.pause: %w", err) } return nil } // promoteSnapshotDir moves every regular file from srcDir into dstDir using // rename(2). Renames are per-file so an existing rootfs.ext4 inside dstDir // that is currently held open by a loop device keeps its inode (the directory // entry is replaced, but the open fd still references the old inode). srcDir // is removed on success. 
func promoteSnapshotDir(srcDir, dstDir string) error { if err := os.MkdirAll(dstDir, 0o755); err != nil { return fmt.Errorf("mkdir dst: %w", err) } entries, err := os.ReadDir(srcDir) if err != nil { return fmt.Errorf("read staging: %w", err) } for _, e := range entries { from := filepath.Join(srcDir, e.Name()) to := filepath.Join(dstDir, e.Name()) if err := os.Rename(from, to); err != nil { return fmt.Errorf("rename %s: %w", e.Name(), err) } } return os.RemoveAll(srcDir) } // releaseRuntime tears down VM, network, dm-snapshot, and loop refcount for // a paused sandbox. The CoW file is preserved when keep == keepCow so Resume // can re-attach it. type cowDisposition int const ( keepCow cowDisposition = iota dropCow ) func (m *Manager) releaseRuntime(sb *sandboxState, cow cowDisposition) { // Cancel any background memory loader (UFFD page faulter) before // destroying the VM. Without this, the loader keeps trying to fault // pages into a vanished guest and races with sb.client being cleared // below. Mirror the cleanup() pattern. if sb.memLoadCancel != nil { sb.memLoadCancel() if sb.memLoadDone != nil { <-sb.memLoadDone } } m.stopSampler(sb) if err := m.vm.Destroy(context.Background(), sb.ID); err != nil { slog.Warn("vm destroy on pause", "id", sb.ID, "error", err) } if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network remove on pause", "id", sb.ID, "error", err) } // Retain the slot when keeping the CoW (pause): Resume must re-acquire // the same SlotIndex so the sandbox's host-IP stays stable. Releasing // here lets a subsequent Create steal slot 1 while we're paused, and // Resume's slots.Reserve() then fails with "slot already in use". 
if cow == dropCow { m.slots.Release(sb.SlotIndex) } if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("dm-snapshot remove on pause", "id", sb.ID, "error", err) } if cow == dropCow { os.Remove(sb.dmDevice.CowPath) } } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } // Clear runtime references; they're rebuilt on resume. sb.slot = nil sb.client.Store(nil) sb.dmDevice = nil } // Resume re-launches a paused sandbox from its on-disk snapshot. The same // SlotIndex is reserved so the sandbox keeps its host-IP. The dm-snapshot // is re-attached to the existing CoW file, then CH is launched with // --restore. Memory faults in lazily via userfaultfd. // // The snapshot directory is NOT deleted after a successful resume: CH keeps // an open fd to memory-ranges for lazy page faulting throughout the VM's // lifetime. The next Pause writes to a fresh staging dir and swaps; only // then is the previous generation discarded. // // The remaining args (defaultUser, env, etc.) are forwarded to envd's /init // so the resumed sandbox sees the same execution environment as before. func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, defaultUser, _ string, envVars map[string]string) (*models.Sandbox, error) { if m.draining.Load() { return nil, ErrDraining } sb, err := m.get(sandboxID) if err != nil { return nil, err } sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() if sb.Status == models.StatusRunning { return &sb.Sandbox, nil } if sb.Status != models.StatusPaused { return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotPaused, sandboxID, sb.Status) } snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) meta, err := readSnapshotMeta(snapDir) if err != nil { return nil, fmt.Errorf("load snapshot meta: %w", err) } resumed, err := m.resumeFromMeta(ctx, sb, meta, snapDir) if err != nil { // resumeFromMeta rolled back its own runtime resources. 
Leave the // sandbox in Paused state so the caller can retry — the on-disk // snapshot and slot reservation are intact. Evicting from m.boxes // would orphan a recoverable sandbox: DB still says paused but the // agent would return NotFound on retry. m.mu.Lock() sb.Status = models.StatusPaused m.mu.Unlock() return nil, err } // Single /init then start the memory loader. See initAndStartMemoryLoader // for the ordering rationale (init resets envd atomics that the loader // then re-arms — reversing the order silently corrupts the next snapshot). m.initAndStartMemoryLoader(ctx, resumed, defaultUser, id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}), envVars) if timeoutSec > 0 { m.mu.Lock() sb.TimeoutSec = clampTimeout(timeoutSec) m.mu.Unlock() } return &sb.Sandbox, nil } // resumeFromMeta wires up the runtime resources (loop, dm-snapshot, network, // CH process) for a paused sandbox and waits until envd is ready. // // On any failure the partial setup is rolled back so the sandbox stays in // a clean Paused state. func (m *Manager) resumeFromMeta(ctx context.Context, sb *sandboxState, meta *snapshotMeta, snapDir string) (*sandboxState, error) { // 1. Re-acquire the shared loop device for the base template. originLoop, err := m.loops.Acquire(meta.BaseTemplate) if err != nil { return nil, fmt.Errorf("acquire loop: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(meta.BaseTemplate) return nil, fmt.Errorf("origin size: %w", err) } // 2. Re-attach the dm-snapshot using the persistent CoW file. dmName := "wrenn-" + sb.ID dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, meta.CowPath, originSize) if err != nil { m.loops.Release(meta.BaseTemplate) return nil, fmt.Errorf("restore dm-snapshot: %w", err) } // 3. Slot is already held continuously from Create through Pause — // the allocator never released it on Pause, so the SlotIndex from meta // is still reserved for this sandbox. 
Just rebuild the Slot struct. slot := network.NewSlot(meta.SlotIndex) if err := network.CreateNetwork(slot); err != nil { if rmErr := devicemapper.RemoveSnapshot(context.Background(), dmDev); rmErr != nil { slog.Warn("dm remove during resume rollback", "id", sb.ID, "error", rmErr) } m.loops.Release(meta.BaseTemplate) return nil, fmt.Errorf("create network: %w", err) } rollback := func() { warnErr("network remove during resume rollback", sb.ID, network.RemoveNetwork(slot)) // Slot stays reserved across pause/resume — released only on Destroy. warnErr("dm remove during resume rollback", sb.ID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) m.loops.Release(meta.BaseTemplate) } // 4-6. Launch CH in restore mode, wait envd, deflate balloon. Sandbox // keeps its original ID/SandboxDir so the disk path baked into // config.json (`/tmp/ch-vm-{originalID}/rootfs.ext4`) resolves to the // re-attached dm device via the tmpfs symlink set up by the launcher. vmCfg := m.buildRestoreVMConfig(restoreInputs{ sandboxID: sb.ID, templateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}), snapDir: snapDir, rootfsPath: dmDev.DevicePath, vcpus: meta.VCPUs, memoryMB: meta.MemoryMB, slot: slot, sandboxDir: meta.SandboxDir, }) client, err := m.launchRestoredVM(ctx, vmCfg, slot.HostIP.String()) if err != nil { rollback() return nil, err } // /init is invoked once by the outer Resume so a single lifecycle bump // reaches envd. (Calling it here too would double-restart port forwarder.) // 7. Re-hydrate in-memory state. m.mu.Lock() sb.slot = slot sb.client.Store(client) sb.dmDevice = dmDev sb.sandboxDirOverride = meta.SandboxDir // baseImagePath pairs the loop refcount we just Acquire'd with the // matching Release inside cleanup() / releaseRuntime(). For a sandbox // rehydrated from RestorePausedSandboxes this is the first time // baseImagePath is populated — the restored entry intentionally leaves // it empty so a Destroy-before-Resume cannot underflow the registry. 
sb.baseImagePath = meta.BaseTemplate sb.connTracker.Reset() sb.HostIP = slot.HostIP sb.RootfsPath = dmDev.DevicePath sb.LastActiveAt = time.Now() sb.Status = models.StatusRunning m.mu.Unlock() m.startSampler(sb) m.startCrashWatcher(sb) // Background memory loader is started by the outer Resume AFTER /init // completes — see comment there for the race rationale. slog.Info("sandbox resumed", "id", sb.ID, "host_ip", slot.HostIP.String()) return sb, nil } // startMemoryLoader spawns the background goroutine that asks envd to read // every guest physical page so subsequent snapshots are self-contained. The // goroutine is cancellable via sb.memLoadCancel and closes sb.memLoadDone on // exit. Must be called with sb in StatusRunning and sb.client populated. func (m *Manager) startMemoryLoader(sb *sandboxState) { loadCtx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) m.mu.Lock() sb.memLoadCancel = cancel sb.memLoadDone = done m.mu.Unlock() go func() { defer close(done) client := sb.client.Load() if client == nil { return } started := time.Now() // Kick the loader off in envd. The POST returns as soon as the // background thread is queued — actual materialisation continues // inside envd independent of this connection. startCtx, startCancel := context.WithTimeout(loadCtx, 30*time.Second) if _, err := client.StartMemoryPreload(startCtx); err != nil { startCancel() if loadCtx.Err() != nil { slog.Debug("memory preload start cancelled", "id", sb.ID) return } slog.Warn("memory preload start failed", "id", sb.ID, "error", err) return } startCancel() // Poll envd for completion. Polling interval is coarse (1s) since the // loader runs for many seconds; the polls just check an atomic. 
status, err := client.WaitMemoryPreload(loadCtx) if err != nil { if loadCtx.Err() != nil { slog.Debug("memory preload wait cancelled", "id", sb.ID) return } slog.Warn("memory preload wait failed", "id", sb.ID, "error", err) return } if status.State != "done" { slog.Warn("memory preload finished abnormally", "id", sb.ID, "state", status.State, "error", status.Error, "pages", status.Pages, "bytes", status.Bytes, "source", status.Source, ) return } slog.Info("memory preload complete", "id", sb.ID, "elapsed", time.Since(started), "pages", status.Pages, "bytes", status.Bytes, "source", status.Source, ) }() } // waitForMemoryLoader blocks until the background memory loader finishes, or // until ctx is cancelled. Returns nil if the loader is already done or not // running. A pause must wait on this before ch.snapshot so the resulting // memory-ranges is self-contained. func (m *Manager) waitForMemoryLoader(ctx context.Context, sb *sandboxState) error { m.mu.RLock() done := sb.memLoadDone m.mu.RUnlock() if done == nil { return nil } select { case <-done: return nil case <-ctx.Done(): return fmt.Errorf("wait for memory loader: %w", ctx.Err()) } } // CreateSnapshot writes a self-contained template snapshot to // WRENN_DIR/images/teams/{teamID}/{templateID}/, then returns the total size // (in bytes) of the artefacts written. // // A running sandbox is snapshotted live (briefly paused, memory dumped, rootfs // flattened, then resumed). A paused sandbox is snapshotted straight from its // on-disk pause artefacts without reviving the VM — it stays paused. func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, name string) (int64, error) { sb, err := m.get(sandboxID) if err != nil { return 0, err } sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() // Refuse silent overwrites: every snapshot must land in a fresh // templateID. Defends against caller bugs and concurrent CreateSnapshot // races for the same destination. 
User-facing snapshot-name uniqueness // is also enforced by the CP at the templates table. if m.templateExists(teamID, templateID) { return 0, fmt.Errorf("snapshot template %s/%s already exists", id.UUIDString(teamID), id.UUIDString(templateID)) } switch sb.Status { case models.StatusRunning: return m.snapshotRunningToTemplate(ctx, sb, teamID, templateID, name) case models.StatusPaused: return m.snapshotPausedToTemplate(ctx, sb, teamID, templateID, name) default: return 0, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status) } } // snapshotRunningToTemplate takes a live snapshot of a running sandbox: pause // CH, dump memory + flatten the rootfs into a staging dir, resume CH, then // promote the staged template into place. The sandbox returns to running. func (m *Manager) snapshotRunningToTemplate(ctx context.Context, sb *sandboxState, teamID, templateID pgtype.UUID, name string) (int64, error) { sandboxID := sb.ID // Same rationale as Pause: wait for the background memory loader so the // resulting memory-ranges is self-contained when this sandbox itself was // previously restored from an ondemand snapshot. if err := m.waitForMemoryLoader(ctx, sb); err != nil { return 0, fmt.Errorf("create snapshot %s: %w", sandboxID, err) } dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf(".stage-%s-%d", sandboxID, time.Now().UnixNano())) if err := os.MkdirAll(stageDir, 0o755); err != nil { return 0, fmt.Errorf("mkdir stage dir: %w", err) } defer os.RemoveAll(stageDir) // Quiesce + ch.pause + ch.snapshot into a staging dir. The final dst // may contain the sandbox's own base rootfs.ext4 held open via the loop // device; writing through a staging dir + per-file rename avoids // unlinking that inode while the loop still references it. 
if err := m.quiesceAndPauseCH(ctx, sb); err != nil { _ = m.vm.Resume(context.Background(), sandboxID) sb.connTracker.Reset() return 0, err } if err := m.vm.Snapshot(ctx, sandboxID, stageDir); err != nil { _ = m.vm.Resume(context.Background(), sandboxID) sb.connTracker.Reset() return 0, fmt.Errorf("vm.snapshot: %w", err) } punchZeroPagesInDir(stageDir) // Flatten dm-snapshot → rootfs.ext4. Reads through the dm device which is // stable while CH is paused. rootfsOut := filepath.Join(stageDir, "rootfs.ext4") if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, rootfsOut); err != nil { // Resume so the sandbox doesn't get stuck. Caller sees the error. if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil { slog.Warn("vm resume after flatten failure", "id", sandboxID, "error", rerr) } sb.connTracker.Reset() return 0, fmt.Errorf("flatten rootfs: %w", err) } // SlotIndex is intentionally omitted: a snapshot template allocates a // fresh network slot on every launch, so the source sandbox's slot is // meaningless. SandboxDir, however, must be recorded — see snapshotMeta. meta := &snapshotMeta{ TemplateName: name, TeamID: id.UUIDString(teamID), TemplateID: id.UUIDString(templateID), VCPUs: sb.VCPUs, MemoryMB: sb.MemoryMB, TimeoutSec: sb.TimeoutSec, BaseTemplate: sb.baseImagePath, SandboxDir: effectiveSandboxDir(sb), CreatedAt: time.Now(), } if err := writeSnapshotMeta(stageDir, meta); err != nil { slog.Warn("template meta write failed", "id", sandboxID, "error", err) } // Resume the live sandbox; the staged snapshot is fully written. // On resume failure we still Reset the connTracker: leaving it draining // would refuse all subsequent proxy connections even though the VM is // effectively running (just wedged on the CH side). The error returned // to the caller surfaces the wedge state. 
if err := m.vm.Resume(ctx, sandboxID); err != nil { sb.connTracker.Reset() return 0, fmt.Errorf("vm resume after live snapshot: %w", err) } sb.connTracker.Reset() // Promote staging → final destination via per-file rename. if err := promoteSnapshotDir(stageDir, dstDir); err != nil { return 0, fmt.Errorf("promote snapshot: %w", err) } // Tell envd to refresh its clock and lifecycle. Brief pause means clock // drift is usually <1s but PostInit is cheap. if c := sb.client.Load(); c != nil { if err := c.PostInit(ctx); err != nil { slog.Warn("envd PostInit after live snapshot", "id", sandboxID, "error", err) } } size, err := snapshot.DirSize(dstDir, "") if err != nil { slog.Warn("snapshot size calc failed", "id", sandboxID, "error", err) } slog.Info("live snapshot created", "id", sandboxID, "team_id", teamID, "template_id", templateID, "dir", dstDir, "bytes", size, ) return size, nil } // snapshotPausedToTemplate builds a self-contained template from a paused // sandbox's on-disk artefacts without reviving the VM. The pause snapshot // already holds a self-contained CH memory image (Pause blocks on the memory // loader before snapshotting), so we copy those memory files verbatim and // flatten the persistent CoW into rootfs.ext4. The sandbox stays Paused. func (m *Manager) snapshotPausedToTemplate(ctx context.Context, sb *sandboxState, teamID, templateID pgtype.UUID, name string) (int64, error) { snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sb.ID) meta, err := readSnapshotMeta(snapDir) if err != nil { return 0, fmt.Errorf("load pause snapshot meta: %w", err) } dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf(".stage-%s-%d", sb.ID, time.Now().UnixNano())) if err := os.MkdirAll(stageDir, 0o755); err != nil { return 0, fmt.Errorf("mkdir stage dir: %w", err) } defer os.RemoveAll(stageDir) // Flatten the persistent CoW into a standalone rootfs.ext4. 
The VM is down, // so re-attach a throwaway dm-snapshot over the base image + CoW just long // enough to read through it; the CoW file is left intact for a later Resume. if err := m.flattenPausedCow(ctx, sb.ID, meta, filepath.Join(stageDir, "rootfs.ext4")); err != nil { return 0, err } // Copy CH's memory snapshot files verbatim (state.json, config.json, // memory-ranges, …) — everything except the CoW and the pause meta, which // the template replaces with its own rootfs.ext4 and meta below. if err := copyMemorySnapshotFiles(snapDir, stageDir); err != nil { return 0, err } // Template meta: no SlotIndex (a template allocates a fresh slot per launch); // SandboxDir + BaseTemplate carried forward so the restore path resolves the // tmpfs disk path baked into CH's config.json. tmplMeta := &snapshotMeta{ TemplateName: name, TeamID: id.UUIDString(teamID), TemplateID: id.UUIDString(templateID), VCPUs: meta.VCPUs, MemoryMB: meta.MemoryMB, TimeoutSec: meta.TimeoutSec, BaseTemplate: meta.BaseTemplate, SandboxDir: meta.SandboxDir, CreatedAt: time.Now(), } if err := writeSnapshotMeta(stageDir, tmplMeta); err != nil { slog.Warn("template meta write failed", "id", sb.ID, "error", err) } if err := promoteSnapshotDir(stageDir, dstDir); err != nil { return 0, fmt.Errorf("promote snapshot: %w", err) } size, err := snapshot.DirSize(dstDir, "") if err != nil { slog.Warn("snapshot size calc failed", "id", sb.ID, "error", err) } slog.Info("paused snapshot created", "id", sb.ID, "team_id", teamID, "template_id", templateID, "dir", dstDir, "bytes", size, ) return size, nil } // flattenPausedCow re-attaches a temporary dm-snapshot over a paused sandbox's // base image + persistent CoW, flattens it into outPath, then tears the dm // device down. The CoW file is preserved (RemoveSnapshot never deletes it) so a // later Resume still works. A distinct dm name avoids colliding with the // "wrenn-{id}" device a concurrent Resume would create — though lifecycleMu // already serialises the two. 
func (m *Manager) flattenPausedCow(ctx context.Context, sandboxID string, meta *snapshotMeta, outPath string) error { originLoop, err := m.loops.Acquire(meta.BaseTemplate) if err != nil { return fmt.Errorf("acquire loop: %w", err) } defer m.loops.Release(meta.BaseTemplate) originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { return fmt.Errorf("origin size: %w", err) } dmDev, err := devicemapper.RestoreSnapshot(ctx, "wrenn-flat-"+sandboxID, originLoop, meta.CowPath, originSize) if err != nil { return fmt.Errorf("restore dm-snapshot: %w", err) } defer func() { if rerr := devicemapper.RemoveSnapshot(context.Background(), dmDev); rerr != nil { slog.Warn("dm remove after paused flatten", "id", sandboxID, "error", rerr) } }() if err := devicemapper.FlattenSnapshot(dmDev.DevicePath, outPath); err != nil { return fmt.Errorf("flatten rootfs: %w", err) } return nil } // copyMemorySnapshotFiles copies every regular file from a pause snapshot dir // into dstDir except the CoW and the wrenn meta — i.e. CH's own memory snapshot // artefacts (state.json, config.json, memory-ranges, …). It hardlinks when the // dirs share a filesystem (instant, preserves sparseness) and falls back to a // sparse-preserving copy across filesystems. Pause never mutates these files in // place — the next Pause writes a fresh dir and swaps — so a hardlink stays a // valid, immutable view for the template. 
func copyMemorySnapshotFiles(srcDir, dstDir string) error { entries, err := os.ReadDir(srcDir) if err != nil { return fmt.Errorf("read pause dir: %w", err) } for _, e := range entries { if e.IsDir() { continue } name := e.Name() if name == layout.SandboxCowName || name == snapshotMetaFile { continue } if err := linkOrCopyFile(filepath.Join(srcDir, name), filepath.Join(dstDir, name)); err != nil { return fmt.Errorf("copy %s: %w", name, err) } } return nil } // linkOrCopyFile hardlinks from→to, falling back to a sparse-preserving copy // when the two paths live on different filesystems (os.Link returns EXDEV). A // plain byte copy would materialise the zero pages punched out of memory-ranges // — inflating a multi-GB snapshot to its full apparent size — so the fallback // uses `cp --sparse=always`, which re-detects and re-punches the holes. func linkOrCopyFile(from, to string) error { if err := os.Link(from, to); err == nil { return nil } if out, err := exec.Command("cp", "--sparse=always", from, to).CombinedOutput(); err != nil { return fmt.Errorf("sparse copy: %s: %w", string(out), err) } return nil } // DeleteSnapshot removes a template snapshot directory. Refuses deletion // while any in-memory sandbox is still derived from this template — even // though Linux unlink lets the open loop device keep working, the agent // would be unable to re-acquire it after a restart and a concurrent // LoopRegistry.Acquire would fail mid-flight. 
func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error { m.mu.RLock() var users []string for sbID, sb := range m.boxes { if sb.TemplateTeamID == teamID.Bytes && sb.TemplateID == templateID.Bytes { users = append(users, sbID) } } m.mu.RUnlock() if len(users) > 0 { return fmt.Errorf("snapshot %s/%s is in use by %d sandbox(es): %v", id.UUIDString(teamID), id.UUIDString(templateID), len(users), users) } dir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if err := os.RemoveAll(dir); err != nil { return fmt.Errorf("remove snapshot dir: %w", err) } // Prune the parent team directory if this was the team's last template, // so deleting a template leaves no residual directory behind. pruneEmptyDir(filepath.Dir(dir)) slog.Info("template snapshot deleted", "team_id", teamID, "template_id", templateID) return nil } // pruneEmptyDir removes dir only when it is empty. Best-effort: a non-empty // dir or any filesystem error is silently ignored. Used to clean up a team's // template parent directory once its last template has been removed. func pruneEmptyDir(dir string) { entries, err := os.ReadDir(dir) if err != nil || len(entries) > 0 { return } if err := os.Remove(dir); err != nil { slog.Warn("prune empty template dir", "path", dir, "error", err) } } // FlattenRootfs writes the current dm-snapshot state to a new template // rootfs without taking a memory snapshot. Used to publish a sandbox's // disk-only state as a base image. The sandbox is briefly paused for I/O // consistency. 
func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) { sb, err := m.get(sandboxID) if err != nil { return 0, err } sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() if sb.Status != models.StatusRunning { return 0, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status) } dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf(".stage-%s-%d", sandboxID, time.Now().UnixNano())) if err := os.MkdirAll(stageDir, 0o755); err != nil { return 0, fmt.Errorf("mkdir stage dir: %w", err) } defer os.RemoveAll(stageDir) // quiesceAndPauseCH drains connections and calls envd /snapshot/prepare // (sync + drop_caches) before ch.pause. A plain ch.pause only freezes the // vCPUs — guest VFS page-cache writes (e.g. freshly pip-installed files) // would not yet have reached the block device, so the flattened rootfs // would capture empty files. Matches CreateSnapshot and Pause. if err := m.quiesceAndPauseCH(ctx, sb); err != nil { // quiesceAndPauseCH force-closes tracked connections before ch.pause. // On failure, resume and reset so the sandbox doesn't get stuck // refusing new proxy connections. Mirrors CreateSnapshot. 
_ = m.vm.Resume(context.Background(), sandboxID) sb.connTracker.Reset() return 0, fmt.Errorf("quiesce for flatten: %w", err) } flattenErr := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, filepath.Join(stageDir, "rootfs.ext4")) if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil { slog.Warn("vm resume after flatten", "id", sandboxID, "error", rerr) } sb.connTracker.Reset() if flattenErr != nil { return 0, fmt.Errorf("flatten: %w", flattenErr) } if err := promoteSnapshotDir(stageDir, dstDir); err != nil { return 0, fmt.Errorf("promote rootfs: %w", err) } size, err := snapshot.DirSize(dstDir, "") if err != nil { slog.Warn("flatten size calc failed", "id", sandboxID, "error", err) } return size, nil } // pauseAllConcurrency caps how many sandboxes PauseAll snapshots in // parallel. Each Pause writes guest RAM to disk and contends on host I/O // bandwidth, so unbounded parallelism would thrash. 8 keeps a busy host // from sequential 30s tails without saturating disk on smaller hosts. const pauseAllConcurrency = 8 // PauseAll pauses every running sandbox. Used by the host agent on graceful // shutdown so VMs can be resumed by the next agent instance. // // Runs Pauses concurrently with a bounded worker pool: per-sandbox Pause // blocks on the post-resume memory loader (up to 30s) plus ch.snapshot of // guest RAM (seconds-to-tens-of-seconds), so a serial loop would multiply // the shutdown budget by the running count. lifecycleMu is per-sandbox so // there is no cross-sandbox locking; m.mu is taken briefly for status flips. // // On each successful Pause, emits a sandbox.auto_paused event synchronously // so the CP can mark the DB row paused before the agent process exits. Sync // (not async) because Shutdown fires the process down right after — async // sends would race with exit. 
HostMonitor reconciles any event we fail to // deliver here, but emitting promptly avoids leaving sandboxes stuck as // 'running' in the DB until the next monitor tick or unreachable threshold. func (m *Manager) PauseAll(ctx context.Context) { m.mu.RLock() ids := make([]string, 0, len(m.boxes)) for id, sb := range m.boxes { if sb.Status == models.StatusRunning { ids = append(ids, id) } } m.mu.RUnlock() if len(ids) == 0 { return } sem := make(chan struct{}, pauseAllConcurrency) var wg sync.WaitGroup for _, sbID := range ids { wg.Add(1) sem <- struct{}{} go func(sbID string) { defer wg.Done() defer func() { <-sem }() if err := m.Pause(ctx, sbID); err != nil { slog.Warn("PauseAll: pause failed", "id", sbID, "error", err) return } if m.eventSender == nil { return } if err := m.eventSender.Send(ctx, LifecycleEvent{ Event: "sandbox.auto_paused", SandboxID: sbID, }); err != nil { slog.Warn("PauseAll: notify CP failed (reconciler will catch it)", "id", sbID, "error", err) } }(sbID) } wg.Wait() } // CleanupOrphanPauseDirs removes leftover *.staging-*, *.stage-*, and *.trash-* // dirs under sandboxes/ from any Pause/snapshot/flatten that crashed before // completing its swap or promote. Safe to call at agent startup before any // sandbox is created or restored. // // Per-sandbox cleanup happens implicitly during Destroy (which removes the // whole PauseSnapshotDir) — this function only handles agent-crash orphans. func CleanupOrphanPauseDirs(wrennDir string) { sandboxesDir := layout.SandboxesDir(wrennDir) entries, err := os.ReadDir(sandboxesDir) if err != nil { // Directory does not exist yet — nothing to clean. return } for _, e := range entries { if !e.IsDir() { continue } name := e.Name() // ".stage-" is the prefix used by snapshot/flatten staging dirs; // ".staging-" + ".trash-" are used by Pause's swap. (".stage-" is not a // substring of ".staging-", so all three need an explicit check.) 
if !strings.Contains(name, ".stage-") && !strings.Contains(name, ".staging-") && !strings.Contains(name, ".trash-") { continue } path := filepath.Join(sandboxesDir, name) if err := os.RemoveAll(path); err != nil { slog.Warn("orphan pause artifact remove failed", "path", path, "error", err) continue } slog.Info("removed orphan pause artifact", "path", path) } }