1
0
forked from wrenn/wrenn
Files
wrenn-releases/internal/sandbox/pause.go
Rafeed M. Bhuiyan 05ddf62399 v0.2.0 (#50)
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#50
2026-05-24 21:10:37 +00:00

1181 lines
44 KiB
Go

// Package sandbox: pause / resume / live-snapshot orchestration.
//
// Two high-level operations both built on the same CH primitives. Names use
// wrenn.* vs ch.* so it is clear which layer a step belongs to.
//
// wrenn.snapshot = ch.pause + ch.snapshot + ch.resume
// artefacts -> WRENN_DIR/images/teams/{teamID}/{templateID}/
// sandbox keeps running; dm-snapshot also flattened into
// rootfs.ext4 so the dir is a self-contained template.
//
// wrenn.pause = ch.pause + ch.snapshot + ch.destroy
// artefacts -> WRENN_DIR/sandboxes/{sandboxID}/
// VM torn down; CoW file at WRENN_DIR/sandboxes/{id}/rootfs.cow
// + network slot retained so resume reaches the same host-IP.
//
// Pause always writes to a fresh staging directory and atomically swaps it
// into place after ch.destroy releases CH's open fd to the previous
// generation's memory-ranges (held via userfaultfd for lazy memory restore).
// This is what makes pause-resume-pause-resume chains correct: an in-place
// rewrite would risk CH reading from the file we are simultaneously
// overwriting.
//
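// CH 52+ writes memory-ranges as a sparse file via SEEK_DATA/SEEK_HOLE.
// Combined with `thp:false` + `free_page_reporting:true` on the balloon,
// guest free pages are reported to the host and skipped by the snapshot
// writer, so no explicit balloon inflation is needed before a pause. Zero
// pages that CH still writes verbatim are punched out of the snapshot
// afterwards on a best-effort basis (see punchZeroPagesInDir).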
package sandbox
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/jackc/pgx/v5/pgtype"
"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
"git.omukk.dev/wrenn/wrenn/internal/layout"
"git.omukk.dev/wrenn/wrenn/internal/models"
"git.omukk.dev/wrenn/wrenn/internal/network"
"git.omukk.dev/wrenn/wrenn/internal/snapshot"
"git.omukk.dev/wrenn/wrenn/internal/vm"
"git.omukk.dev/wrenn/wrenn/pkg/id"
)
// File names and timeouts shared by the pause, resume and snapshot paths in
// this file.
const (
// snapshotMetaFile is the per-snapshot metadata file holding the info
// needed to restore the sandbox (template, resources, slot, etc.). It is
// written by writeSnapshotMeta and read back by readSnapshotMeta.
snapshotMetaFile = "wrenn-snapshot.json"
// drainTimeout is how long pause waits for in-flight proxy connections
// to release before forcibly cancelling them (see quiesceAndPauseCH).
drainTimeout = 5 * time.Second
// prepareSnapshotTimeout bounds the in-guest /snapshot/prepare call.
// Short on purpose: envd PrepareSnapshot is best-effort, and a wedged
// guest must not block the host-side pause path.
prepareSnapshotTimeout = 5 * time.Second
// vmInfoProbeTimeout bounds the CH /vm.info liveness probe issued
// before destructive CH ops (pause/snapshot). Local unix-socket call —
// kept tight so a dead socket fails fast.
vmInfoProbeTimeout = 3 * time.Second
// vmPauseTimeout bounds ch.pause. Pause itself is fast; the deadline
// guards against a wedged CH unix socket hanging the request.
vmPauseTimeout = 30 * time.Second
)
// snapshotMeta is persisted into every snapshot directory. It captures the
// minimum information needed to restore the sandbox or build a new sandbox
// from a template, independent of the in-memory state in m.boxes.
//
// It is serialised as JSON into snapshotMetaFile by writeSnapshotMeta and
// loaded by readSnapshotMeta.
type snapshotMeta struct {
// TemplateName is the human-readable template name. Set for snapshot
// templates (CreateSnapshot); empty for pause snapshots.
TemplateName string `json:"template_name,omitempty"`
// TeamID and TemplateID are UUID strings. Pause records the sandbox's own
// TemplateTeamID / TemplateID; CreateSnapshot records the IDs of the
// template being created.
TeamID string `json:"team_id"`
TemplateID string `json:"template_id"`
// VCPUs and MemoryMB are the sandbox's resources at snapshot time; Resume
// feeds them back into the restore VM config.
VCPUs int `json:"vcpus"`
MemoryMB int `json:"memory_mb"`
// TimeoutSec is the sandbox timeout in effect when the snapshot was taken.
TimeoutSec int `json:"timeout_sec"`
// SlotIndex is the retained network slot. Only meaningful for pause
// snapshots — resume re-acquires the same slot so the host-IP is stable.
// Omitted for snapshot templates, which allocate a fresh slot per launch.
SlotIndex int `json:"slot_index,omitempty"`
// BaseTemplate is the base image path of the source sandbox. Resume uses
// it as the key for acquiring (and later releasing) the shared loop device.
BaseTemplate string `json:"base_template"`
// CowPath is the path of the persistent CoW file. Written by Pause only;
// Resume hands it to devicemapper.RestoreSnapshot to re-attach the
// dm-snapshot. Snapshot templates leave it empty.
CowPath string `json:"cow_path,omitempty"`
// SandboxDir pins the CH SandboxDir on restore — the tmpfs path baked
// into CH's saved config.json. Always set: a restored sandbox gets a
// fresh ID, but config.json keeps the tmpfs path of the sandbox the
// snapshot was taken from, so the launcher must reconstruct it exactly.
// For a snapshot-of-a-snapshot this is the root ancestor's path, carried
// forward verbatim through the chain.
SandboxDir string `json:"sandbox_dir"`
// CreatedAt is the wall-clock time at which the metadata was assembled.
CreatedAt time.Time `json:"created_at"`
}
// effectiveSandboxDir reports the tmpfs SandboxDir that the running VM was
// configured with, i.e. the path recorded in CH's config.json.
//
// When the sandbox carries no override the directory is derived from the
// sandbox's own ID; otherwise the override (inherited from the snapshot the
// sandbox was launched from) wins.
func effectiveSandboxDir(sb *sandboxState) string {
	if sb.sandboxDirOverride == "" {
		return vm.SandboxTmpDir(sb.ID)
	}
	return sb.sandboxDirOverride
}
// writeSnapshotMeta serialises m as indented JSON and stores it as
// snapshotMetaFile inside dir (mode 0644). Marshal and write failures are
// returned wrapped with a short description of the failing step.
func writeSnapshotMeta(dir string, m *snapshotMeta) error {
	encoded, marshalErr := json.MarshalIndent(m, "", " ")
	if marshalErr != nil {
		return fmt.Errorf("marshal snapshot meta: %w", marshalErr)
	}

	target := filepath.Join(dir, snapshotMetaFile)
	writeErr := os.WriteFile(target, encoded, 0o644)
	if writeErr != nil {
		return fmt.Errorf("write snapshot meta: %w", writeErr)
	}
	return nil
}
// readSnapshotMeta loads snapshotMetaFile from dir and decodes it. The
// returned pointer is nil whenever an error is returned; read and decode
// failures are wrapped with a short description of the failing step.
func readSnapshotMeta(dir string) (*snapshotMeta, error) {
	raw, readErr := os.ReadFile(filepath.Join(dir, snapshotMetaFile))
	if readErr != nil {
		return nil, fmt.Errorf("read snapshot meta: %w", readErr)
	}

	decoded := new(snapshotMeta)
	if decodeErr := json.Unmarshal(raw, decoded); decodeErr != nil {
		return nil, fmt.Errorf("unmarshal snapshot meta: %w", decodeErr)
	}
	return decoded, nil
}
// Pause freezes the VM, persists the snapshot to WRENN_DIR/sandboxes/{id}/,
// and tears down VM/network/dm resources. The CoW file is kept on disk so
// Resume can pick up where the sandbox left off.
//
// The sandbox stays in m.boxes with Status=Paused. The cow file at
// WRENN_DIR/sandboxes/{id}/rootfs.cow persists; on Resume it is re-attached
// via devicemapper.RestoreSnapshot.
//
// Write strategy: snapshot is written into a fresh staging directory, the
// VM is destroyed (closing CH's open fd to any previous-generation
// memory-ranges), then the staging directory atomically replaces the
// previous one via rename. This is essential for pause-resume-pause chains
// where CH holds the old memory-ranges open via userfaultfd while we write
// the new one.
//
// Return values:
//   - nil when the sandbox is paused on return (including the case where it
//     was already Paused on entry).
//   - an error wrapping ErrNotRunning when the sandbox is neither Running
//     nor Paused.
//   - any other error when a step failed. Failures before the VM is destroyed
//     attempt to un-freeze the VM and return the sandbox to Running; failures
//     after that point leave the sandbox in StatusError.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
sb, err := m.get(sandboxID)
if err != nil {
return err
}
// Serialise with other lifecycle operations on this sandbox; Resume and
// CreateSnapshot take the same per-sandbox lock.
sb.lifecycleMu.Lock()
defer sb.lifecycleMu.Unlock()
// Pausing an already-paused sandbox is a no-op.
if sb.Status == models.StatusPaused {
return nil
}
if sb.Status != models.StatusRunning {
return fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
}
// Wait for the post-resume memory loader to finish before snapshotting.
// Without this, ch.snapshot's SEEK_DATA/SEEK_HOLE writer would emit holes
// for any page not yet faulted in, which read back as zero on the next
// restore — silent corruption across pause/resume chains.
if err := m.waitForMemoryLoader(ctx, sb); err != nil {
return fmt.Errorf("pause %s: %w", sandboxID, err)
}
m.mu.Lock()
sb.Status = models.StatusPausing
m.mu.Unlock()
finalDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
stageDir := layout.PauseStagingDir(m.cfg.WrennDir, sandboxID)
// rollbackToRunning undoes a pause attempt that failed while the CH process
// is still alive: it discards the staging dir, asks CH to resume the VM and
// re-opens the connection tracker. The resume uses context.Background(), so
// it is attempted even when ctx has already been cancelled. The returned
// error always wraps cause and names the failing stage.
rollbackToRunning := func(cause error, stage string) error {
_ = os.RemoveAll(stageDir)
// If the VM can't be unfrozen the sandbox is no longer usable.
// Mark it Error so subsequent RPCs don't operate on a broken VM
// (especially after a partial vm.snapshot which can leave CH wedged).
if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil {
m.mu.Lock()
sb.Status = models.StatusError
m.mu.Unlock()
sb.connTracker.Reset()
return fmt.Errorf("pause %s: %s: %w (and resume failed: %v)",
sandboxID, stage, cause, rerr)
}
sb.connTracker.Reset()
m.mu.Lock()
sb.Status = models.StatusRunning
m.mu.Unlock()
return fmt.Errorf("pause %s: %s: %w", sandboxID, stage, cause)
}
// Drain proxy connections, let envd prepare, then freeze the VM.
if err := m.quiesceAndPauseCH(ctx, sb); err != nil {
return rollbackToRunning(err, "quiesce")
}
// Memory materialisation is handled out-of-band by the background loader
// kicked off by Resume after /init. We blocked on it above (waitForMemoryLoader)
// so by the time we reach ch.snapshot every guest page is resident in CH's
// memfile and SEEK_DATA/SEEK_HOLE produces a self-contained snapshot.
if err := os.MkdirAll(stageDir, 0o755); err != nil {
return rollbackToRunning(err, "mkdir staging")
}
if err := m.vm.Snapshot(ctx, sandboxID, stageDir); err != nil {
return rollbackToRunning(err, "snapshot")
}
// Punch zero pages CH wrote verbatim (guest had them dirty-then-free
// without notifying the balloon driver). Best-effort; failures only
// cost disk space.
punchZeroPagesInDir(stageDir)
// Capture everything Resume needs while the runtime references on sb
// (dmDevice, baseImagePath) are still populated; releaseRuntime below
// clears them.
meta := &snapshotMeta{
TeamID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateTeamID, Valid: true}),
TemplateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}),
VCPUs: sb.VCPUs,
MemoryMB: sb.MemoryMB,
TimeoutSec: sb.TimeoutSec,
SlotIndex: sb.SlotIndex,
BaseTemplate: sb.baseImagePath,
CowPath: sb.dmDevice.CowPath,
SandboxDir: effectiveSandboxDir(sb),
CreatedAt: time.Now(),
}
if err := writeSnapshotMeta(stageDir, meta); err != nil {
// Without meta, Resume cannot reconstruct the sandbox. Treat as fatal.
// (rollbackToRunning removes stageDir as well; the explicit removal
// here is redundant but harmless.)
_ = os.RemoveAll(stageDir)
return rollbackToRunning(err, "write meta")
}
// Point of no return: from here on the VM is gone and failures can only
// end in StatusError, not in a rollback to Running.
//
// releaseRuntime destroys the VM, which closes CH's open fd to any
// previous-generation memory-ranges. Must happen BEFORE we touch finalDir
// so the swap is safe. It also tears down the dm-snapshot so the CoW file
// inside finalDir is no longer held open and can be moved.
m.releaseRuntime(sb, keepCow)
// CoW lives at finalDir/rootfs.cow. swapDir replaces finalDir wholesale,
// which would discard it. Move it into stageDir first so the swap carries
// the CoW through alongside the new snapshot files.
cowFinal := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID)
cowStage := filepath.Join(stageDir, layout.SandboxCowName)
// A missing CoW file is tolerated; any other rename failure is fatal.
if err := os.Rename(cowFinal, cowStage); err != nil && !os.IsNotExist(err) {
m.mu.Lock()
sb.Status = models.StatusError
m.mu.Unlock()
return fmt.Errorf("pause %s: stage cow: %w", sandboxID, err)
}
if err := swapDir(stageDir, finalDir); err != nil {
// CH is already destroyed — we cannot roll back to Running. The
// staging snapshot is still on disk for forensic recovery.
m.mu.Lock()
sb.Status = models.StatusError
m.mu.Unlock()
return fmt.Errorf("pause %s: swap snapshot dir: %w", sandboxID, err)
}
m.mu.Lock()
sb.Status = models.StatusPaused
m.mu.Unlock()
slog.Info("sandbox paused", "id", sandboxID, "snapshot_dir", finalDir)
return nil
}
// swapDir atomically replaces final with stage. Any existing final entry is
// moved aside to a uniquely-named trash path before the swap so the rename
// can succeed, then the trash is removed.
//
// The existence check uses Lstat rather than Stat: Stat follows symlinks, so
// a dangling symlink left at final would be reported as "absent" and the
// subsequent rename of the stage directory onto it would fail. With Lstat
// the link itself is moved to trash like any other previous generation.
//
// Failure modes:
//   - move-old-to-trash fails: previous final dir is intact. stage remains.
//   - stage-to-final fails: we attempt to restore old from trash. If that
//     fails, the sandbox is wedged but stage still holds valid data.
//   - trash removal fails: previous generation is orphaned, will be GC'd
//     on next agent startup.
//
// Returns nil once stage has been renamed to final, even if the trash could
// not be removed (that case is only logged).
func swapDir(stage, final string) error {
	// The nanosecond timestamp keeps trash names unique across generations.
	trash := final + ".trash-" + strconv.FormatInt(time.Now().UnixNano(), 10)

	hadOld := true
	if _, err := os.Lstat(final); err != nil {
		if !os.IsNotExist(err) {
			return fmt.Errorf("stat existing final dir: %w", err)
		}
		hadOld = false
	}

	if hadOld {
		if err := os.Rename(final, trash); err != nil {
			return fmt.Errorf("move old final to trash: %w", err)
		}
	}

	if err := os.Rename(stage, final); err != nil {
		// Try to put the old one back so the previous generation stays usable.
		if hadOld {
			if rerr := os.Rename(trash, final); rerr != nil {
				slog.Warn("could not restore previous snapshot dir after failed swap",
					"trash", trash, "final", final, "error", rerr)
			}
		}
		return fmt.Errorf("move stage to final: %w", err)
	}

	if hadOld {
		// The swap already succeeded; a leftover trash dir only costs disk.
		if err := os.RemoveAll(trash); err != nil {
			slog.Warn("could not remove trashed snapshot dir",
				"path", trash, "error", err)
		}
	}
	return nil
}
// quiesceAndPauseCH prepares a running sandbox for ch.snapshot and freezes it.
//
// Steps, in order:
//  1. Drain proxy connections for up to drainTimeout, then force-close the rest.
//  2. If an envd client is present, ask envd to prepare for the snapshot
//     (bounded by prepareSnapshotTimeout; failure is logged and ignored) and
//     drop the client's idle connections.
//  3. Probe CH via vm.info (bounded by vmInfoProbeTimeout) and require the
//     "Running" state.
//  4. Issue ch.pause (bounded by vmPauseTimeout).
//
// On a nil return the VM is frozen; the caller must follow up with either
// ch.resume or ch.destroy. A non-nil return means the probe or the pause
// itself failed.
//
// Snapshot-size optimisation relies on virtio-balloon's free_page_reporting:
// envd drops the VFS page cache + fstrim + a settle window inside
// /snapshot/prepare, which gives the guest balloon driver time to report all
// the now-free pages to the host. CH punches those reports out of the backing
// memfile and v52+'s SEEK_DATA/SEEK_HOLE snapshot writer skips them. No
// explicit balloon inflate is required — inflation would constrain the guest
// post-resume (forced re-allocation of large free regions), and free_page_
// reporting drains everything we'd have inflated anyway.
func (m *Manager) quiesceAndPauseCH(ctx context.Context, sb *sandboxState) error {
	// Stop traffic first: graceful drain, then hard close of whatever is left.
	sb.connTracker.Drain(drainTimeout)
	sb.connTracker.ForceClose()

	if envd := sb.client.Load(); envd != nil {
		// Keep the in-guest call on a short leash. A wedged envd or a
		// half-torn-down netns could otherwise block for the full envd
		// client timeout (2m), which looks like a hung snapshot to the user.
		prepCtx, cancelPrep := context.WithTimeout(ctx, prepareSnapshotTimeout)
		prepErr := envd.PrepareSnapshot(prepCtx)
		cancelPrep()
		if prepErr != nil {
			slog.Warn("envd prepare-snapshot failed (continuing)", "id", sb.ID, "error", prepErr)
		}
		envd.CloseIdleConnections()
	}

	// Liveness probe before anything destructive: against a dead CH process
	// vm.pause would otherwise sit on the unix socket until the dial times out.
	probeCtx, cancelProbe := context.WithTimeout(ctx, vmInfoProbeTimeout)
	vmState, probeErr := m.vm.Info(probeCtx, sb.ID)
	cancelProbe()
	switch {
	case probeErr != nil:
		return fmt.Errorf("ch.vm.info probe: %w", probeErr)
	case vmState != "Running":
		return fmt.Errorf("ch.vm.info: VM in state %q, not Running", vmState)
	}

	pauseCtx, cancelPause := context.WithTimeout(ctx, vmPauseTimeout)
	pauseErr := m.vm.Pause(pauseCtx, sb.ID)
	cancelPause()
	if pauseErr != nil {
		return fmt.Errorf("ch.pause: %w", pauseErr)
	}
	return nil
}
// promoteSnapshotDir moves every entry of srcDir into dstDir, one rename(2)
// per entry, creating dstDir (mode 0755) first if needed. Entries already
// present in dstDir under other names are left alone.
//
// Renaming entry by entry — rather than swapping the directory — means an
// existing rootfs.ext4 in dstDir that is held open by a loop device keeps its
// inode: only the directory entry is replaced, the open fd still refers to
// the old file.
//
// The first failing rename aborts the promotion and is returned; entries
// moved before it stay in dstDir. On success srcDir is removed and the result
// of that removal is returned.
func promoteSnapshotDir(srcDir, dstDir string) error {
	if mkErr := os.MkdirAll(dstDir, 0o755); mkErr != nil {
		return fmt.Errorf("mkdir dst: %w", mkErr)
	}

	staged, listErr := os.ReadDir(srcDir)
	if listErr != nil {
		return fmt.Errorf("read staging: %w", listErr)
	}

	for i := range staged {
		name := staged[i].Name()
		oldPath := filepath.Join(srcDir, name)
		newPath := filepath.Join(dstDir, name)
		if mvErr := os.Rename(oldPath, newPath); mvErr != nil {
			return fmt.Errorf("rename %s: %w", name, mvErr)
		}
	}

	return os.RemoveAll(srcDir)
}
// cowDisposition tells releaseRuntime what to do with the resources that
// outlive the VM: the sandbox's CoW file and its network slot.
//
// releaseRuntime tears down VM, network, dm-snapshot, and loop refcount for
// a paused sandbox. The CoW file is preserved when keep == keepCow so Resume
// can re-attach it.
type cowDisposition int
const (
// keepCow leaves the CoW file on disk and keeps the network slot reserved.
// Pause passes this value.
keepCow cowDisposition = iota
// dropCow releases the network slot and deletes the CoW file.
dropCow
)
// releaseRuntime tears down the runtime resources of a sandbox: background
// memory loader, sampler, CH VM, network, dm-snapshot and the loop-device
// refcount. Teardown failures are logged and never abort the remaining steps.
//
// cow selects what survives:
//   - keepCow: the CoW file stays on disk and the network slot stays reserved,
//     so a later Resume can re-attach both.
//   - dropCow: the slot is released and the CoW file is deleted.
//
// On return sb.slot, sb.client and sb.dmDevice are cleared; they are rebuilt
// on resume.
func (m *Manager) releaseRuntime(sb *sandboxState, cow cowDisposition) {
	// Cancel any background memory loader (UFFD page faulter) before
	// destroying the VM. Without this, the loader keeps trying to fault
	// pages into a vanished guest and races with sb.client being cleared
	// below. Mirror the cleanup() pattern.
	if sb.memLoadCancel != nil {
		sb.memLoadCancel()
		if sb.memLoadDone != nil {
			<-sb.memLoadDone
		}
	}
	m.stopSampler(sb)

	if err := m.vm.Destroy(context.Background(), sb.ID); err != nil {
		slog.Warn("vm destroy on pause", "id", sb.ID, "error", err)
	}
	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("network remove on pause", "id", sb.ID, "error", err)
	}

	// Retain the slot when keeping the CoW (pause): Resume must re-acquire
	// the same SlotIndex so the sandbox's host-IP stays stable. Releasing
	// here lets a subsequent Create steal slot 1 while we're paused, and
	// Resume's slots.Reserve() then fails with "slot already in use".
	if cow == dropCow {
		m.slots.Release(sb.SlotIndex)
	}

	if sb.dmDevice != nil {
		if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
			slog.Warn("dm-snapshot remove on pause", "id", sb.ID, "error", err)
		}
		if cow == dropCow {
			// An already-missing CoW file is fine; anything else means the
			// file is leaked on disk, so surface it instead of dropping it.
			if err := os.Remove(sb.dmDevice.CowPath); err != nil && !os.IsNotExist(err) {
				slog.Warn("cow remove on release", "id", sb.ID,
					"path", sb.dmDevice.CowPath, "error", err)
			}
		}
	}

	// Pair the loop refcount taken when the base image was attached.
	if sb.baseImagePath != "" {
		m.loops.Release(sb.baseImagePath)
	}

	// Clear runtime references; they're rebuilt on resume.
	sb.slot = nil
	sb.client.Store(nil)
	sb.dmDevice = nil
}
// Resume brings a paused sandbox back to life from its on-disk pause
// snapshot. The sandbox keeps its SlotIndex (and therefore its host-IP); the
// dm-snapshot is re-attached to the existing CoW file and CH is launched in
// restore mode, with guest memory faulted in lazily via userfaultfd.
//
// The snapshot directory is deliberately left in place after a successful
// resume: CH keeps an open fd to memory-ranges for lazy page faulting for the
// whole lifetime of the VM. It is only discarded when the next Pause swaps a
// fresh generation into place.
//
// timeoutSec, when positive, replaces the sandbox timeout (after clamping).
// defaultUser and envVars are forwarded to envd's /init so the resumed
// sandbox sees the same execution environment as before; the second string
// argument is unused.
//
// Returns the sandbox unchanged if it is already Running, ErrDraining while
// the manager is draining, and an error wrapping ErrNotPaused for any status
// other than Paused.
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, defaultUser, _ string, envVars map[string]string) (*models.Sandbox, error) {
	if m.draining.Load() {
		return nil, ErrDraining
	}

	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}

	sb.lifecycleMu.Lock()
	defer sb.lifecycleMu.Unlock()

	switch sb.Status {
	case models.StatusRunning:
		// Already up: nothing to do.
		return &sb.Sandbox, nil
	case models.StatusPaused:
		// The only state we can resume from; continue below.
	default:
		return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotPaused, sandboxID, sb.Status)
	}

	pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
	meta, metaErr := readSnapshotMeta(pauseDir)
	if metaErr != nil {
		return nil, fmt.Errorf("load snapshot meta: %w", metaErr)
	}

	live, resumeErr := m.resumeFromMeta(ctx, sb, meta, pauseDir)
	if resumeErr != nil {
		// resumeFromMeta has already unwound its own runtime resources. Keep
		// the sandbox Paused so the caller can retry: the on-disk snapshot
		// and the slot reservation are untouched. Evicting it from m.boxes
		// would orphan a recoverable sandbox — the DB would still say paused
		// while the agent answers NotFound on retry.
		m.mu.Lock()
		sb.Status = models.StatusPaused
		m.mu.Unlock()
		return nil, resumeErr
	}

	// One /init, then the memory loader. See initAndStartMemoryLoader for the
	// ordering rationale (init resets envd atomics that the loader then
	// re-arms — reversing the order silently corrupts the next snapshot).
	templateID := id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true})
	m.initAndStartMemoryLoader(ctx, live, defaultUser, templateID, envVars)

	if timeoutSec > 0 {
		m.mu.Lock()
		sb.TimeoutSec = clampTimeout(timeoutSec)
		m.mu.Unlock()
	}
	return &sb.Sandbox, nil
}
// resumeFromMeta rebuilds the runtime resources of a paused sandbox — loop
// device, dm-snapshot, network and CH process — from the values recorded in
// meta, and launches CH in restore mode from snapDir.
//
// Every failure unwinds whatever was set up before it, so the caller is left
// with a sandbox that still looks cleanly Paused. On success the in-memory
// state of sb is re-hydrated, its status becomes Running, the sampler and
// crash watcher are started, and sb itself is returned.
//
// envd's /init and the background memory loader are NOT handled here; the
// outer Resume performs both, in that order.
func (m *Manager) resumeFromMeta(ctx context.Context, sb *sandboxState, meta *snapshotMeta, snapDir string) (*sandboxState, error) {
	baseImage := meta.BaseTemplate
	releaseLoop := func() { m.loops.Release(baseImage) }

	// Step 1: re-acquire the shared loop device backing the base template.
	loopDev, err := m.loops.Acquire(baseImage)
	if err != nil {
		return nil, fmt.Errorf("acquire loop: %w", err)
	}
	baseSize, err := devicemapper.OriginSizeBytes(loopDev)
	if err != nil {
		releaseLoop()
		return nil, fmt.Errorf("origin size: %w", err)
	}

	// Step 2: re-attach the dm-snapshot on top of the persistent CoW file.
	snapDev, err := devicemapper.RestoreSnapshot(ctx, "wrenn-"+sb.ID, loopDev, meta.CowPath, baseSize)
	if err != nil {
		releaseLoop()
		return nil, fmt.Errorf("restore dm-snapshot: %w", err)
	}

	// Step 3: the slot has been held continuously from Create through Pause —
	// the allocator never released it — so the SlotIndex in meta is still
	// ours. Only the Slot struct and the network itself need rebuilding.
	netSlot := network.NewSlot(meta.SlotIndex)
	if netErr := network.CreateNetwork(netSlot); netErr != nil {
		if rmErr := devicemapper.RemoveSnapshot(context.Background(), snapDev); rmErr != nil {
			slog.Warn("dm remove during resume rollback", "id", sb.ID, "error", rmErr)
		}
		releaseLoop()
		return nil, fmt.Errorf("create network: %w", netErr)
	}

	// undo unwinds steps 1-3. The slot itself stays reserved across
	// pause/resume; it is released only on Destroy.
	undo := func() {
		warnErr("network remove during resume rollback", sb.ID, network.RemoveNetwork(netSlot))
		warnErr("dm remove during resume rollback", sb.ID, devicemapper.RemoveSnapshot(context.Background(), snapDev))
		releaseLoop()
	}

	// Steps 4-6: launch CH in restore mode, wait for envd, deflate balloon.
	// The sandbox keeps its original ID/SandboxDir so the disk path baked
	// into config.json (`/tmp/ch-vm-{originalID}/rootfs.ext4`) resolves to
	// the re-attached dm device via the tmpfs symlink the launcher sets up.
	inputs := restoreInputs{
		sandboxID:  sb.ID,
		templateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}),
		snapDir:    snapDir,
		rootfsPath: snapDev.DevicePath,
		vcpus:      meta.VCPUs,
		memoryMB:   meta.MemoryMB,
		slot:       netSlot,
		sandboxDir: meta.SandboxDir,
	}
	vmCfg := m.buildRestoreVMConfig(inputs)
	envd, launchErr := m.launchRestoredVM(ctx, vmCfg, netSlot.HostIP.String())
	if launchErr != nil {
		undo()
		return nil, launchErr
	}

	// /init is issued exactly once, by the outer Resume, so envd sees a
	// single lifecycle bump. (Calling it here as well would restart the port
	// forwarder twice.)

	// Step 7: re-hydrate the in-memory state.
	m.mu.Lock()
	sb.slot = netSlot
	sb.client.Store(envd)
	sb.dmDevice = snapDev
	sb.sandboxDirOverride = meta.SandboxDir
	// baseImagePath pairs the loop refcount acquired above with the matching
	// Release inside cleanup() / releaseRuntime(). For a sandbox rehydrated
	// from RestorePausedSandboxes this is the first time the field is
	// populated — the restored entry intentionally leaves it empty so a
	// Destroy-before-Resume cannot underflow the registry.
	sb.baseImagePath = baseImage
	sb.connTracker.Reset()
	sb.HostIP = netSlot.HostIP
	sb.RootfsPath = snapDev.DevicePath
	sb.LastActiveAt = time.Now()
	sb.Status = models.StatusRunning
	m.mu.Unlock()

	m.startSampler(sb)
	m.startCrashWatcher(sb)

	// The background memory loader is started by the outer Resume AFTER
	// /init completes — see the comment there for the race rationale.
	slog.Info("sandbox resumed", "id", sb.ID, "host_ip", netSlot.HostIP.String())
	return sb, nil
}
// startMemoryLoader launches the background goroutine that asks envd to touch
// every guest physical page, so that later snapshots are self-contained.
//
// The goroutine's cancel function and completion channel are published on sb
// (memLoadCancel / memLoadDone) under m.mu before it starts; the channel is
// closed when the goroutine exits, whatever the outcome. Callers must invoke
// this with sb in StatusRunning and sb.client populated — with a nil client
// the goroutine exits immediately.
func (m *Manager) startMemoryLoader(sb *sandboxState) {
	loadCtx, cancelLoad := context.WithCancel(context.Background())
	finished := make(chan struct{})

	m.mu.Lock()
	sb.memLoadCancel = cancelLoad
	sb.memLoadDone = finished
	m.mu.Unlock()

	go func() {
		defer close(finished)

		envd := sb.client.Load()
		if envd == nil {
			return
		}

		// logFailure separates "we were cancelled" (debug) from a genuine
		// failure (warning).
		logFailure := func(cancelledMsg, failedMsg string, cause error) {
			if loadCtx.Err() != nil {
				slog.Debug(cancelledMsg, "id", sb.ID)
				return
			}
			slog.Warn(failedMsg, "id", sb.ID, "error", cause)
		}

		began := time.Now()

		// Kick the loader off inside envd. The POST returns as soon as the
		// background thread is queued — the actual materialisation continues
		// in envd independently of this connection.
		startCtx, cancelStart := context.WithTimeout(loadCtx, 30*time.Second)
		_, startErr := envd.StartMemoryPreload(startCtx)
		cancelStart()
		if startErr != nil {
			logFailure("memory preload start cancelled", "memory preload start failed", startErr)
			return
		}

		// Poll envd for completion. The polling interval is coarse (1s): the
		// loader runs for many seconds and each poll only checks an atomic.
		status, waitErr := envd.WaitMemoryPreload(loadCtx)
		if waitErr != nil {
			logFailure("memory preload wait cancelled", "memory preload wait failed", waitErr)
			return
		}

		if status.State == "done" {
			slog.Info("memory preload complete",
				"id", sb.ID,
				"elapsed", time.Since(began),
				"pages", status.Pages,
				"bytes", status.Bytes,
				"source", status.Source,
			)
			return
		}

		slog.Warn("memory preload finished abnormally",
			"id", sb.ID,
			"state", status.State,
			"error", status.Error,
			"pages", status.Pages,
			"bytes", status.Bytes,
			"source", status.Source,
		)
	}()
}
// waitForMemoryLoader waits for the background memory loader of sb to exit.
//
// It returns nil right away when no loader was ever started (memLoadDone is
// nil) or when the loader has already finished. If ctx ends first, the
// context error is returned wrapped. A pause must wait here before
// ch.snapshot so that the resulting memory-ranges is self-contained.
func (m *Manager) waitForMemoryLoader(ctx context.Context, sb *sandboxState) error {
	// Snapshot the channel under the read lock; startMemoryLoader replaces it
	// under the write lock.
	loaderDone := func() <-chan struct{} {
		m.mu.RLock()
		defer m.mu.RUnlock()
		return sb.memLoadDone
	}()
	if loaderDone == nil {
		return nil
	}

	select {
	case <-ctx.Done():
		return fmt.Errorf("wait for memory loader: %w", ctx.Err())
	case <-loaderDone:
		return nil
	}
}
// CreateSnapshot produces a self-contained template snapshot under
// WRENN_DIR/images/teams/{teamID}/{templateID}/ and returns the total size,
// in bytes, of the artefacts written.
//
// The source sandbox decides the strategy:
//   - Running: live snapshot — the VM is briefly paused, memory is dumped, the
//     rootfs is flattened and the VM is resumed.
//   - Paused: the template is assembled from the on-disk pause artefacts
//     without reviving the VM; the sandbox stays paused.
//   - anything else: an error wrapping ErrNotRunning.
//
// An error is also returned when the destination template already exists.
func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, name string) (int64, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return 0, err
	}

	sb.lifecycleMu.Lock()
	defer sb.lifecycleMu.Unlock()

	// Never overwrite silently: each snapshot has to land in a fresh
	// templateID. This guards against caller bugs and against concurrent
	// CreateSnapshot calls racing for the same destination. User-facing
	// snapshot-name uniqueness is additionally enforced by the CP at the
	// templates table.
	if m.templateExists(teamID, templateID) {
		return 0, fmt.Errorf("snapshot template %s/%s already exists",
			id.UUIDString(teamID), id.UUIDString(templateID))
	}

	status := sb.Status
	if status == models.StatusRunning {
		return m.snapshotRunningToTemplate(ctx, sb, teamID, templateID, name)
	}
	if status == models.StatusPaused {
		return m.snapshotPausedToTemplate(ctx, sb, teamID, templateID, name)
	}
	return 0, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
}
// snapshotRunningToTemplate takes a live snapshot of a running sandbox: pause
// CH, dump memory + flatten the rootfs into a staging dir, resume CH, then
// promote the staged template into place. The sandbox returns to running.
//
// Every resume of the frozen VM — on the failure paths and on the success
// path alike — uses context.Background() rather than the request context: ctx
// may be cancelled while the (potentially long) rootfs flatten is running,
// and a resume issued on a dead context would leave the sandbox frozen.
//
// A template is only promoted when its metadata file was written: the
// restore path needs the SandboxDir recorded there, so a template without
// metadata is unusable.
//
// Returns the size in bytes of the promoted template directory. A failure to
// compute that size is logged and does not fail the call.
func (m *Manager) snapshotRunningToTemplate(ctx context.Context, sb *sandboxState, teamID, templateID pgtype.UUID, name string) (int64, error) {
	sandboxID := sb.ID

	// Same rationale as Pause: wait for the background memory loader so the
	// resulting memory-ranges is self-contained when this sandbox itself was
	// previously restored from an ondemand snapshot.
	if err := m.waitForMemoryLoader(ctx, sb); err != nil {
		return 0, fmt.Errorf("create snapshot %s: %w", sandboxID, err)
	}

	dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir),
		fmt.Sprintf(".stage-%s-%d", sandboxID, time.Now().UnixNano()))
	if err := os.MkdirAll(stageDir, 0o755); err != nil {
		return 0, fmt.Errorf("mkdir stage dir: %w", err)
	}
	defer os.RemoveAll(stageDir)

	// unfreeze is the rollback for a failed step: resume the VM so the
	// sandbox does not stay stuck, log (rather than drop) a resume failure,
	// and re-open the connection tracker that quiesceAndPauseCH drained.
	unfreeze := func(warnMsg string) {
		if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil {
			slog.Warn(warnMsg, "id", sandboxID, "error", rerr)
		}
		sb.connTracker.Reset()
	}

	// Quiesce + ch.pause + ch.snapshot into a staging dir. The final dst
	// may contain the sandbox's own base rootfs.ext4 held open via the loop
	// device; writing through a staging dir + per-file rename avoids
	// unlinking that inode while the loop still references it.
	if err := m.quiesceAndPauseCH(ctx, sb); err != nil {
		unfreeze("vm resume after quiesce failure")
		return 0, err
	}
	if err := m.vm.Snapshot(ctx, sandboxID, stageDir); err != nil {
		unfreeze("vm resume after snapshot failure")
		return 0, fmt.Errorf("vm.snapshot: %w", err)
	}
	punchZeroPagesInDir(stageDir)

	// Flatten dm-snapshot → rootfs.ext4. Reads through the dm device which is
	// stable while CH is paused.
	rootfsOut := filepath.Join(stageDir, "rootfs.ext4")
	if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, rootfsOut); err != nil {
		// Resume so the sandbox doesn't get stuck. Caller sees the error.
		unfreeze("vm resume after flatten failure")
		return 0, fmt.Errorf("flatten rootfs: %w", err)
	}

	// SlotIndex is intentionally omitted: a snapshot template allocates a
	// fresh network slot on every launch, so the source sandbox's slot is
	// meaningless. SandboxDir, however, must be recorded — see snapshotMeta.
	meta := &snapshotMeta{
		TemplateName: name,
		TeamID:       id.UUIDString(teamID),
		TemplateID:   id.UUIDString(templateID),
		VCPUs:        sb.VCPUs,
		MemoryMB:     sb.MemoryMB,
		TimeoutSec:   sb.TimeoutSec,
		BaseTemplate: sb.baseImagePath,
		SandboxDir:   effectiveSandboxDir(sb),
		CreatedAt:    time.Now(),
	}
	// Remember the outcome but resume the VM first: getting the live sandbox
	// running again takes priority over reporting a metadata failure.
	metaErr := writeSnapshotMeta(stageDir, meta)

	// Resume the live sandbox; the staged snapshot is fully written.
	// On resume failure we still Reset the connTracker: leaving it draining
	// would refuse all subsequent proxy connections even though the VM is
	// effectively running (just wedged on the CH side). The error returned
	// to the caller surfaces the wedge state.
	resumeErr := m.vm.Resume(context.Background(), sandboxID)
	sb.connTracker.Reset()
	if resumeErr != nil {
		return 0, fmt.Errorf("vm resume after live snapshot: %w", resumeErr)
	}
	if metaErr != nil {
		// Nothing has been promoted yet; the deferred RemoveAll discards
		// the incomplete staging dir.
		return 0, fmt.Errorf("write template meta: %w", metaErr)
	}

	// Promote staging → final destination via per-file rename.
	if err := promoteSnapshotDir(stageDir, dstDir); err != nil {
		return 0, fmt.Errorf("promote snapshot: %w", err)
	}

	// Tell envd to refresh its clock and lifecycle. Brief pause means clock
	// drift is usually <1s but PostInit is cheap.
	if c := sb.client.Load(); c != nil {
		if err := c.PostInit(ctx); err != nil {
			slog.Warn("envd PostInit after live snapshot", "id", sandboxID, "error", err)
		}
	}

	size, err := snapshot.DirSize(dstDir, "")
	if err != nil {
		slog.Warn("snapshot size calc failed", "id", sandboxID, "error", err)
	}
	slog.Info("live snapshot created",
		"id", sandboxID,
		"team_id", teamID,
		"template_id", templateID,
		"dir", dstDir,
		"bytes", size,
	)
	return size, nil
}
// snapshotPausedToTemplate builds a self-contained template from a paused
// sandbox's on-disk artefacts without reviving the VM. The pause snapshot
// already holds a self-contained CH memory image (Pause blocks on the memory
// loader before snapshotting), so we copy those memory files verbatim and
// flatten the persistent CoW into rootfs.ext4. The sandbox stays Paused.
//
// The template is assembled in a staging directory and only promoted once
// the rootfs, the memory files and the template metadata have all been
// written; any failure before that leaves the destination untouched and the
// staging directory is removed on return.
//
// Returns the size in bytes of the promoted template directory. A failure to
// compute that size is logged and does not fail the call.
func (m *Manager) snapshotPausedToTemplate(ctx context.Context, sb *sandboxState, teamID, templateID pgtype.UUID, name string) (int64, error) {
	snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sb.ID)
	meta, err := readSnapshotMeta(snapDir)
	if err != nil {
		return 0, fmt.Errorf("load pause snapshot meta: %w", err)
	}

	dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir),
		fmt.Sprintf(".stage-%s-%d", sb.ID, time.Now().UnixNano()))
	if err := os.MkdirAll(stageDir, 0o755); err != nil {
		return 0, fmt.Errorf("mkdir stage dir: %w", err)
	}
	defer os.RemoveAll(stageDir)

	// Flatten the persistent CoW into a standalone rootfs.ext4. The VM is down,
	// so re-attach a throwaway dm-snapshot over the base image + CoW just long
	// enough to read through it; the CoW file is left intact for a later Resume.
	if err := m.flattenPausedCow(ctx, sb.ID, meta, filepath.Join(stageDir, "rootfs.ext4")); err != nil {
		return 0, err
	}

	// Copy CH's memory snapshot files verbatim (state.json, config.json,
	// memory-ranges, …) — everything except the CoW and the pause meta, which
	// the template replaces with its own rootfs.ext4 and meta below.
	if err := copyMemorySnapshotFiles(snapDir, stageDir); err != nil {
		return 0, err
	}

	// Template meta: no SlotIndex (a template allocates a fresh slot per launch);
	// SandboxDir + BaseTemplate carried forward so the restore path resolves the
	// tmpfs disk path baked into CH's config.json.
	tmplMeta := &snapshotMeta{
		TemplateName: name,
		TeamID:       id.UUIDString(teamID),
		TemplateID:   id.UUIDString(templateID),
		VCPUs:        meta.VCPUs,
		MemoryMB:     meta.MemoryMB,
		TimeoutSec:   meta.TimeoutSec,
		BaseTemplate: meta.BaseTemplate,
		SandboxDir:   meta.SandboxDir,
		CreatedAt:    time.Now(),
	}
	// A template without its metadata cannot be restored (the SandboxDir
	// recorded there is required), so do not promote one. Nothing has touched
	// dstDir yet and the deferred RemoveAll discards the staging dir.
	if err := writeSnapshotMeta(stageDir, tmplMeta); err != nil {
		return 0, fmt.Errorf("write template meta: %w", err)
	}

	if err := promoteSnapshotDir(stageDir, dstDir); err != nil {
		return 0, fmt.Errorf("promote snapshot: %w", err)
	}

	size, err := snapshot.DirSize(dstDir, "")
	if err != nil {
		slog.Warn("snapshot size calc failed", "id", sb.ID, "error", err)
	}
	slog.Info("paused snapshot created",
		"id", sb.ID,
		"team_id", teamID,
		"template_id", templateID,
		"dir", dstDir,
		"bytes", size,
	)
	return size, nil
}
// flattenPausedCow materialises a paused sandbox's root filesystem at outPath.
//
// It acquires the loop device for the base image named in meta, attaches a
// temporary dm-snapshot over that origin plus the sandbox's persistent CoW
// (meta.CowPath), flattens the resulting device into outPath, and detaches the
// dm device again on return. The CoW file is preserved (RemoveSnapshot never
// deletes it), so a later Resume still works.
//
// The dm device is named "wrenn-flat-{id}" rather than "wrenn-{id}" so it
// cannot collide with the device a concurrent Resume would create — though
// lifecycleMu already serialises the two.
func (m *Manager) flattenPausedCow(ctx context.Context, sandboxID string, meta *snapshotMeta, outPath string) error {
	base := meta.BaseTemplate
	loopDev, err := m.loops.Acquire(base)
	if err != nil {
		return fmt.Errorf("acquire loop: %w", err)
	}
	defer m.loops.Release(base)

	sizeBytes, err := devicemapper.OriginSizeBytes(loopDev)
	if err != nil {
		return fmt.Errorf("origin size: %w", err)
	}

	dmName := "wrenn-flat-" + sandboxID
	snap, err := devicemapper.RestoreSnapshot(ctx, dmName, loopDev, meta.CowPath, sizeBytes)
	if err != nil {
		return fmt.Errorf("restore dm-snapshot: %w", err)
	}
	// Teardown runs on a background context rather than ctx; a failure is
	// only logged because the flatten result (if any) is already on disk.
	defer func() {
		rerr := devicemapper.RemoveSnapshot(context.Background(), snap)
		if rerr == nil {
			return
		}
		slog.Warn("dm remove after paused flatten", "id", sandboxID, "error", rerr)
	}()

	if ferr := devicemapper.FlattenSnapshot(snap.DevicePath, outPath); ferr != nil {
		return fmt.Errorf("flatten rootfs: %w", ferr)
	}
	return nil
}
// copyMemorySnapshotFiles transfers CH's own memory snapshot artefacts
// (state.json, config.json, memory-ranges, …) from a pause snapshot dir into
// dstDir. Sub-directories are skipped, as are the CoW file and the wrenn meta
// file; every other entry is handed to linkOrCopyFile, which hardlinks when it
// can (instant, preserves sparseness) and otherwise falls back to a
// sparse-preserving copy.
//
// Hardlinking is safe because Pause never mutates these files in place — the
// next Pause writes a fresh dir and swaps — so a hardlink stays a valid,
// immutable view for the template.
//
// The first file that fails aborts the walk and its error is returned.
func copyMemorySnapshotFiles(srcDir, dstDir string) error {
	listing, err := os.ReadDir(srcDir)
	if err != nil {
		return fmt.Errorf("read pause dir: %w", err)
	}
	for _, entry := range listing {
		fileName := entry.Name()
		switch {
		case entry.IsDir():
			continue
		case fileName == layout.SandboxCowName, fileName == snapshotMetaFile:
			// Replaced by the template's own rootfs.ext4 and meta.
			continue
		}
		src := filepath.Join(srcDir, fileName)
		dst := filepath.Join(dstDir, fileName)
		if cerr := linkOrCopyFile(src, dst); cerr != nil {
			return fmt.Errorf("copy %s: %w", fileName, cerr)
		}
	}
	return nil
}
// linkOrCopyFile makes the file at from available at to. It first tries a
// hardlink; if os.Link fails for any reason (the expected case being EXDEV,
// when the two paths live on different filesystems) it falls back to running
// `cp --sparse=always from to`.
//
// The fallback is deliberately not a plain byte copy: that would materialise
// the zero pages punched out of memory-ranges and inflate a multi-GB snapshot
// to its full apparent size, whereas `cp --sparse=always` re-detects and
// re-punches the holes.
//
// On fallback failure the returned error wraps cp's error and includes its
// combined stdout/stderr.
func linkOrCopyFile(from, to string) error {
	if linkErr := os.Link(from, to); linkErr != nil {
		cp := exec.Command("cp", "--sparse=always", from, to)
		output, cpErr := cp.CombinedOutput()
		if cpErr != nil {
			return fmt.Errorf("sparse copy: %s: %w", string(output), cpErr)
		}
	}
	return nil
}
// DeleteSnapshot deletes the on-disk directory of a template snapshot.
//
// Deletion is refused while any in-memory sandbox is still derived from the
// template: although Linux unlink would let an already-open loop device keep
// working, the agent could not re-acquire it after a restart, and a concurrent
// LoopRegistry.Acquire would fail mid-flight. In that case the returned error
// lists the IDs of the sandboxes using it.
//
// After the template directory is removed, the parent team directory is pruned
// if it has become empty.
func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error {
	var dependents []string
	m.mu.RLock()
	for boxID, box := range m.boxes {
		if box.TemplateTeamID != teamID.Bytes || box.TemplateID != templateID.Bytes {
			continue
		}
		dependents = append(dependents, boxID)
	}
	m.mu.RUnlock()

	if n := len(dependents); n > 0 {
		return fmt.Errorf("snapshot %s/%s is in use by %d sandbox(es): %v",
			id.UUIDString(teamID), id.UUIDString(templateID), n, dependents)
	}

	templateDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	if rmErr := os.RemoveAll(templateDir); rmErr != nil {
		return fmt.Errorf("remove snapshot dir: %w", rmErr)
	}

	// If that was the team's last template, drop the now-empty team directory
	// too so no residual directory is left behind.
	pruneEmptyDir(filepath.Dir(templateDir))

	slog.Info("template snapshot deleted", "team_id", teamID, "template_id", templateID)
	return nil
}
// pruneEmptyDir deletes dir when, and only when, it can be listed and contains
// no entries. It is best-effort: a directory that cannot be read or that still
// has entries is left alone without comment, and a failure of the removal
// itself is logged as a warning rather than returned. Used to clean up a
// team's template parent directory once its last template has been removed.
func pruneEmptyDir(dir string) {
	children, readErr := os.ReadDir(dir)
	if readErr == nil && len(children) == 0 {
		if rmErr := os.Remove(dir); rmErr != nil {
			slog.Warn("prune empty template dir", "path", dir, "error", rmErr)
		}
	}
}
// FlattenRootfs writes the current dm-snapshot state to a new template
// rootfs without taking a memory snapshot. Used to publish a sandbox's
// disk-only state as a base image. The sandbox is briefly paused for I/O
// consistency and resumed again before this function returns.
//
// The sandbox's lifecycleMu is held for the whole call. The sandbox must be
// in StatusRunning; otherwise an error wrapping ErrNotRunning is returned.
// The rootfs is written into a staging directory and then promoted to the
// template directory for teamID/templateID.
//
// Returns the size reported by snapshot.DirSize for the promoted template
// directory. A failure to compute that size is only logged; the call still
// returns a nil error in that case.
func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) {
sb, err := m.get(sandboxID)
if err != nil {
return 0, err
}
sb.lifecycleMu.Lock()
defer sb.lifecycleMu.Unlock()
if sb.Status != models.StatusRunning {
return 0, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
}
dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
// Stage under sandboxes/ with a unique, timestamped name; the deferred
// RemoveAll cleans it up on every return path.
stageDir := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir),
fmt.Sprintf(".stage-%s-%d", sandboxID, time.Now().UnixNano()))
if err := os.MkdirAll(stageDir, 0o755); err != nil {
return 0, fmt.Errorf("mkdir stage dir: %w", err)
}
defer os.RemoveAll(stageDir)
// quiesceAndPauseCH drains connections and calls envd /snapshot/prepare
// (sync + drop_caches) before ch.pause. A plain ch.pause only freezes the
// vCPUs — guest VFS page-cache writes (e.g. freshly pip-installed files)
// would not yet have reached the block device, so the flattened rootfs
// would capture empty files. Matches CreateSnapshot and Pause.
if err := m.quiesceAndPauseCH(ctx, sb); err != nil {
// quiesceAndPauseCH force-closes tracked connections before ch.pause.
// On failure, resume and reset so the sandbox doesn't get stuck
// refusing new proxy connections. Mirrors CreateSnapshot.
// The resume error is deliberately dropped: the quiesce error below is
// the one reported to the caller.
_ = m.vm.Resume(context.Background(), sandboxID)
sb.connTracker.Reset()
return 0, fmt.Errorf("quiesce for flatten: %w", err)
}
// Flatten while the VM is paused, but hold on to the error: the VM must be
// resumed and the connection tracker reset whether or not the flatten
// succeeded. Resume uses a background context — presumably so a cancelled
// ctx cannot leave the VM paused.
flattenErr := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, filepath.Join(stageDir, "rootfs.ext4"))
if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil {
slog.Warn("vm resume after flatten", "id", sandboxID, "error", rerr)
}
sb.connTracker.Reset()
if flattenErr != nil {
return 0, fmt.Errorf("flatten: %w", flattenErr)
}
// Only a fully written rootfs reaches the template directory.
if err := promoteSnapshotDir(stageDir, dstDir); err != nil {
return 0, fmt.Errorf("promote rootfs: %w", err)
}
// Size is informational: a failure here is logged and not returned.
size, err := snapshot.DirSize(dstDir, "")
if err != nil {
slog.Warn("flatten size calc failed", "id", sandboxID, "error", err)
}
return size, nil
}
// pauseAllConcurrency is the maximum number of sandboxes PauseAll pauses at
// the same time. Each Pause writes guest RAM to disk and contends on host I/O
// bandwidth, so unbounded parallelism would thrash. 8 is a compromise: it
// keeps a busy host from paying a sequential ~30s tail per sandbox without
// saturating the disk on smaller hosts.
const pauseAllConcurrency = 8
// PauseAll pauses every sandbox that is running at the time of the call. The
// host agent uses it on graceful shutdown so the next agent instance can
// resume the VMs.
//
// Pauses run concurrently, at most pauseAllConcurrency at a time. A single
// Pause blocks on the post-resume memory loader (up to 30s) and then on
// ch.snapshot of guest RAM (seconds to tens of seconds), so a serial loop
// would multiply the shutdown budget by the number of running sandboxes.
// lifecycleMu is per-sandbox, so there is no cross-sandbox locking; m.mu is
// only held briefly here to collect the IDs.
//
// After each successful Pause a sandbox.auto_paused event is sent
// synchronously so the CP can mark the DB row paused before the agent process
// exits; an async send would race with the exit that follows Shutdown.
// HostMonitor reconciles any event that fails to deliver, but emitting
// promptly avoids leaving sandboxes stuck as 'running' in the DB until the
// next monitor tick or unreachable threshold.
//
// Failures are logged per sandbox; PauseAll itself returns nothing and waits
// for all workers to finish.
func (m *Manager) PauseAll(ctx context.Context) {
	m.mu.RLock()
	running := make([]string, 0, len(m.boxes))
	for boxID, box := range m.boxes {
		if box.Status != models.StatusRunning {
			continue
		}
		running = append(running, boxID)
	}
	m.mu.RUnlock()

	if len(running) == 0 {
		return
	}

	// pauseAndNotify pauses one sandbox and, on success, tells the CP.
	pauseAndNotify := func(target string) {
		if err := m.Pause(ctx, target); err != nil {
			slog.Warn("PauseAll: pause failed", "id", target, "error", err)
			return
		}
		if m.eventSender == nil {
			return
		}
		event := LifecycleEvent{
			Event:     "sandbox.auto_paused",
			SandboxID: target,
		}
		if err := m.eventSender.Send(ctx, event); err != nil {
			slog.Warn("PauseAll: notify CP failed (reconciler will catch it)", "id", target, "error", err)
		}
	}

	// slots is a counting semaphore: the send blocks the dispatch loop once
	// pauseAllConcurrency workers are in flight.
	slots := make(chan struct{}, pauseAllConcurrency)
	var workers sync.WaitGroup
	for _, sbID := range running {
		workers.Add(1)
		slots <- struct{}{}
		go func(target string) {
			defer workers.Done()
			defer func() { <-slots }()
			pauseAndNotify(target)
		}(sbID)
	}
	workers.Wait()
}
// CleanupOrphanPauseDirs deletes leftover *.staging-*, *.stage-*, and
// *.trash-* directories under sandboxes/ — the debris of a Pause, snapshot, or
// flatten that crashed before finishing its swap or promote. It is safe to
// call at agent startup, before any sandbox is created or restored.
//
// Only agent-crash orphans are handled here; per-sandbox cleanup happens
// implicitly during Destroy, which removes the whole PauseSnapshotDir.
//
// Nothing is returned: if sandboxes/ cannot be listed (typically because it
// does not exist yet) the call is a no-op, and a directory that cannot be
// removed is logged and skipped.
func CleanupOrphanPauseDirs(wrennDir string) {
	root := layout.SandboxesDir(wrennDir)
	listing, err := os.ReadDir(root)
	if err != nil {
		// Nothing to clean if the directory cannot be read.
		return
	}

	// ".stage-" marks snapshot/flatten staging dirs; ".staging-" and
	// ".trash-" are used by Pause's swap. ".stage-" is not a substring of
	// ".staging-", so each marker has to be checked on its own.
	markers := [...]string{".stage-", ".staging-", ".trash-"}
	isOrphan := func(dirName string) bool {
		for _, marker := range markers {
			if strings.Contains(dirName, marker) {
				return true
			}
		}
		return false
	}

	for _, entry := range listing {
		if !entry.IsDir() || !isOrphan(entry.Name()) {
			continue
		}
		target := filepath.Join(root, entry.Name())
		if rmErr := os.RemoveAll(target); rmErr != nil {
			slog.Warn("orphan pause artifact remove failed", "path", target, "error", rmErr)
			continue
		}
		slog.Info("removed orphan pause artifact", "path", target)
	}
}