package sandbox import ( "context" "errors" "fmt" "log/slog" "net" "os" "os/exec" "path/filepath" "sync" "time" "github.com/jackc/pgx/v5/pgtype" "git.omukk.dev/wrenn/wrenn/internal/devicemapper" "git.omukk.dev/wrenn/wrenn/internal/envdclient" "git.omukk.dev/wrenn/wrenn/internal/layout" "git.omukk.dev/wrenn/wrenn/internal/models" "git.omukk.dev/wrenn/wrenn/internal/network" "git.omukk.dev/wrenn/wrenn/internal/snapshot" "git.omukk.dev/wrenn/wrenn/internal/vm" "git.omukk.dev/wrenn/wrenn/pkg/id" envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen" ) // ErrNotFound is returned when a sandbox is not present in the in-memory map. var ErrNotFound = errors.New("sandbox not found") // Config holds the paths and defaults for the sandbox manager. type Config struct { WrennDir string // root directory (e.g. /var/lib/wrenn); all sub-paths derived via layout package EnvdTimeout time.Duration DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB // Resolved at startup by the host agent. KernelPath string // path to the latest vmlinux-x.y.z KernelVersion string // semver extracted from filename VMMBin string // path to the cloud-hypervisor binary VMMVersion string // semver from cloud-hypervisor --version AgentVersion string // host agent version (injected via ldflags) } // LifecycleEvent describes an autonomous state change initiated by the agent. type LifecycleEvent struct { Event string SandboxID string } // EventSender sends autonomous lifecycle events to the control plane. type EventSender interface { SendAsync(event LifecycleEvent) } // Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd. type Manager struct { cfg Config vm *vm.Manager slots *network.SlotAllocator loops *devicemapper.LoopRegistry mu sync.RWMutex boxes map[string]*sandboxState stopCh chan struct{} autoPausedMu sync.Mutex autoPausedIDs []string // onDestroy is called with the sandbox ID after cleanup completes. // Used by ProxyHandler to evict cached reverse proxies. onDestroy func(sandboxID string) // eventSender sends autonomous lifecycle events (auto-pause, auto-destroy) // to the CP via HTTP callback. Optional — nil means events are only // propagated through the HostMonitor reconciler. eventSender EventSender } // SetOnDestroy registers a callback invoked after each sandbox is cleaned up. func (m *Manager) SetOnDestroy(fn func(sandboxID string)) { m.onDestroy = fn } // SetEventSender registers the callback sender for autonomous lifecycle events. func (m *Manager) SetEventSender(sender EventSender) { m.eventSender = sender } // sandboxState holds the runtime state for a single sandbox. type sandboxState struct { models.Sandbox lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox slot *network.Slot client *envdclient.Client connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain dmDevice *devicemapper.SnapshotDevice baseImagePath string // path to the base template rootfs (for loop registry release) // Metrics sampling state. vmmPID int // VMM process PID (child of unshare wrapper) ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine samplerDone chan struct{} // closed when the sampling goroutine exits } // buildMetadata constructs the metadata map with version information. 
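//
// Illustrative result with hypothetical version strings (the keys match the
// code below; actual values depend on the host):
//
//	map[string]string{
//		"kernel_version": "6.1.34",
//		"vmm_version":    "38.0",
//		"agent_version":  "0.4.2",
//		"envd_version":   "0.4.2", // present only when envd reported a version
//	}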
func (m *Manager) buildMetadata(envdVersion string) map[string]string { meta := map[string]string{ "kernel_version": m.cfg.KernelVersion, "vmm_version": m.cfg.VMMVersion, "agent_version": m.cfg.AgentVersion, } if envdVersion != "" { meta["envd_version"] = envdVersion } return meta } // resolveKernelPath returns the kernel path for the given version hint. // If the exact version exists on disk, it is used. Otherwise, falls back to // the latest kernel (m.cfg.KernelPath). func (m *Manager) resolveKernelPath(versionHint string) string { if versionHint == "" { return m.cfg.KernelPath } exact := layout.KernelPathVersioned(m.cfg.WrennDir, versionHint) if _, err := os.Stat(exact); err == nil { return exact } slog.Warn("requested kernel version not found, using latest", "requested", versionHint, "latest", m.cfg.KernelVersion) return m.cfg.KernelPath } // New creates a new sandbox manager. func New(cfg Config) *Manager { if cfg.EnvdTimeout == 0 { cfg.EnvdTimeout = 30 * time.Second } return &Manager{ cfg: cfg, vm: vm.NewManager(), slots: network.NewSlotAllocator(), loops: devicemapper.NewLoopRegistry(), boxes: make(map[string]*sandboxState), stopCh: make(chan struct{}), } } // Create boots a new sandbox: clone rootfs, set up network, start VM, wait for envd. // If sandboxID is empty, a new ID is generated. func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int) (*models.Sandbox, error) { if sandboxID == "" { sandboxID = id.FormatSandboxID(id.NewSandboxID()) } if vcpus <= 0 { vcpus = 1 } if memoryMB <= 0 { memoryMB = 512 } if diskSizeMB <= 0 { diskSizeMB = 5120 // 5 GB default } // Check if template refers to a CH snapshot (has config.json). tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if _, err := os.Stat(filepath.Join(tmplDir, snapshot.CHConfigFile)); err == nil { return m.createFromSnapshot(ctx, sandboxID, teamID, templateID, vcpus, memoryMB, timeoutSec, diskSizeMB) } // Resolve base rootfs image. baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID) if _, err := os.Stat(baseRootfs); err != nil { return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err) } // Acquire shared read-only loop device for the base image. originLoop, err := m.loops.Acquire(baseRootfs) if err != nil { return nil, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(baseRootfs) return nil, fmt.Errorf("get origin size: %w", err) } // Create dm-snapshot with per-sandbox CoW file. // CoW must be at least as large as the origin — if every block is // rewritten, the CoW stores a full copy. Undersized CoW causes // dm-snapshot invalidation → EIO on all guest I/O. dmName := "wrenn-" + sandboxID cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID)) cowSize := max(int64(diskSizeMB)*1024*1024, originSize) dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize) if err != nil { m.loops.Release(baseRootfs) return nil, fmt.Errorf("create dm-snapshot: %w", err) } res := &createResources{ sandboxID: sandboxID, loops: m.loops, loopImage: baseRootfs, dmDevice: dmDev, cowPath: cowPath, slots: m.slots, } // Allocate network slot. slotIdx, err := m.slots.Allocate() if err != nil { res.rollback() return nil, fmt.Errorf("allocate network slot: %w", err) } res.slotIdx = slotIdx slot := network.NewSlot(slotIdx) // Set up network. 
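	// The slot carries the per-sandbox network namespace, tap device, MAC,
	// and IP addressing; the VM config below and the envd client (which
	// dials slot.HostIP) both read from it.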
if err := network.CreateNetwork(slot); err != nil { res.rollback() return nil, fmt.Errorf("create network: %w", err) } res.slot = slot // Boot VM — CH gets the dm device path. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: id.UUIDString(templateID), KernelPath: m.cfg.KernelPath, RootfsPath: dmDev.DevicePath, VCPUs: vcpus, MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, VMMBin: m.cfg.VMMBin, } if _, err := m.vm.Create(ctx, vmCfg); err != nil { res.rollback() return nil, fmt.Errorf("create VM: %w", err) } res.vm = m.vm // Wait for envd to be ready. client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) defer waitCancel() if err := client.WaitUntilReady(waitCtx); err != nil { res.rollback() return nil, fmt.Errorf("wait for envd: %w", err) } // Fetch envd version (best-effort). envdVersion, _ := client.FetchVersion(ctx) now := time.Now() sb := &sandboxState{ Sandbox: models.Sandbox{ ID: sandboxID, Status: models.StatusRunning, TemplateTeamID: teamID.Bytes, TemplateID: templateID.Bytes, VCPUs: vcpus, MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, slot: slot, client: client, connTracker: &ConnTracker{}, dmDevice: dmDev, baseImagePath: baseRootfs, } m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() m.startSampler(sb) m.startCrashWatcher(sb) slog.Info("sandbox created", "id", sandboxID, "team_id", teamID, "template_id", templateID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, nil } // Destroy stops and cleans up a sandbox. If the sandbox is running, its VM, // network, and rootfs are torn down. Any pause snapshot files are also removed. func (m *Manager) Destroy(ctx context.Context, sandboxID string) error { m.mu.Lock() sb, ok := m.boxes[sandboxID] if ok { delete(m.boxes, sandboxID) } m.mu.Unlock() if ok { // Wait for any in-progress Pause to finish before tearing down resources. sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() m.cleanup(ctx, sb) } // Always clean up pause snapshot files (may exist if sandbox was paused). if err := os.RemoveAll(layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)); err != nil { slog.Warn("snapshot cleanup error", "id", sandboxID, "error", err) } if m.onDestroy != nil { m.onDestroy(sandboxID) } slog.Info("sandbox destroyed", "id", sandboxID) return nil } // cleanup tears down all resources for a sandbox. func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { m.stopSampler(sb) if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error", "id", sb.ID, "error", err) } if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network cleanup error", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) // Tear down dm-snapshot and release the base image loop device. if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) } os.Remove(sb.dmDevice.CowPath) } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } } // cleanupPauseFailure is best-effort teardown when a pause operation fails // after the VM has already been destroyed. It releases all resources and removes // the sandbox from the in-memory map. 
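// Unlike cleanup, it skips vm.Destroy (the VM is already gone) and releases
// only the snapshot dir, network slot, dm-snapshot, CoW file, and base-image
// loop reference before dropping the sandbox from the map.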
func (m *Manager) cleanupPauseFailure(sb *sandboxState, sandboxID string, pauseDir string) { warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) m.slots.Release(sb.SlotIndex) if sb.dmDevice != nil { warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice)) os.Remove(sb.dmDevice.CowPath) } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() } // Pause takes a snapshot of a running sandbox, then destroys all resources. // The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/. // After this call, the sandbox is no longer running but can be resumed. func (m *Manager) Pause(ctx context.Context, sandboxID string) error { sb, err := m.get(sandboxID) if err != nil { return err } // Serialize lifecycle operations on this sandbox to prevent concurrent // Pause/Destroy calls from corrupting VM state. sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } // Mark sandbox as pausing to block new exec/file/PTY operations. m.mu.Lock() sb.Status = models.StatusPausing m.mu.Unlock() // restoreRunning reverts state if any pre-freeze step fails. restoreRunning := func() { _ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0) sb.connTracker.Reset() m.mu.Lock() sb.Status = models.StatusRunning m.mu.Unlock() m.startSampler(sb) } // Stop the metrics sampler goroutine before tearing down any resources // it reads (dm device, VMM PID). Without this, the sampler // leaks on every successful pause. m.stopSampler(sb) // ── Step 1: Isolate from external traffic ───────────────────────── // Drain in-flight proxy connections (grace period for clean shutdown). sb.connTracker.Drain(5 * time.Second) // Force-close any connections that didn't finish during grace period. sb.connTracker.ForceClose() slog.Debug("pause: external connections closed", "id", sandboxID) // Close host-side idle connections to envd so FIN packets propagate // to the guest kernel before snapshot. sb.client.CloseIdleConnections() // ── Step 2: Drop page cache ────────────────────────────────────── // Signal envd to quiesce: drops page cache, stops port subsystem, // marks connections for post-restore cleanup. Page cache drop can // take significant time on large-memory VMs (20GB+). func() { prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second) defer prepCancel() if err := sb.client.PrepareSnapshot(prepCtx); err != nil { slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err) } else { slog.Debug("pause: envd quiesced", "id", sandboxID) } }() // ── Step 3: Inflate balloon to reclaim free guest memory ───────── // Freed pages become zero in the snapshot's memory-ranges file. // CH v52+ writes sparse snapshots natively (SEEK_DATA/SEEK_HOLE). 
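	// Worked example with hypothetical numbers: for a 4096 MiB sandbox whose
	// guest reports ~1200 MiB used, keepMiB = max(1200*3/2, 512) = 1800 and
	// inflateMiB = 4096 - 1800 = 2296, so roughly 2.2 GiB of free guest
	// memory is handed back to the balloon before the snapshot is written.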
func() { memUsed, err := readEnvdMemUsed(ctx, sb.client) if err != nil { slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err) return } usedMiB := int(memUsed / (1024 * 1024)) keepMiB := max(usedMiB*3/2, 512) inflateMiB := sb.MemoryMB - keepMiB if inflateMiB <= 0 { slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB) return } balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second) defer balloonCancel() if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil { slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err) return } time.Sleep(2 * time.Second) slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB) }() // ── Step 4: Freeze vCPUs ───────────────────────────────────────── pauseStart := time.Now() if err := m.vm.Pause(ctx, sandboxID); err != nil { restoreRunning() return fmt.Errorf("pause VM: %w", err) } slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart)) // resumeOnError unpauses the VM so the sandbox stays usable when a // post-freeze step fails. If the resume itself fails, the sandbox is // frozen and unrecoverable — destroy it to avoid a zombie. resumeOnError := func() { resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second) defer resumeCancel() if err := m.vm.Resume(resumeCtx, sandboxID); err != nil { slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err) m.cleanup(context.Background(), sb) m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() if m.onDestroy != nil { m.onDestroy(sandboxID) } return } restoreRunning() } // ── Step 5: Take CH snapshot ───────────────────────────────────── // Snapshot to a temp dir first. If the sandbox was previously resumed, // the old pauseDir still contains memory-ranges that CH's uffd handler // is lazily paging from. CH refuses to overwrite existing files (EEXIST), // so we write to a fresh dir and swap after the VM is destroyed. pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) tmpPauseDir := pauseDir + ".new" if err := os.RemoveAll(tmpPauseDir); err != nil { resumeOnError() return fmt.Errorf("clean temp snapshot dir: %w", err) } if err := os.MkdirAll(tmpPauseDir, 0755); err != nil { resumeOnError() return fmt.Errorf("create snapshot dir: %w", err) } snapshotStart := time.Now() if err := m.vm.Snapshot(ctx, sandboxID, tmpPauseDir); err != nil { slog.Error("pause: snapshot failed", "id", sandboxID, "elapsed", time.Since(snapshotStart), "error", err) warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(tmpPauseDir)) resumeOnError() return fmt.Errorf("create VM snapshot: %w", err) } slog.Debug("pause: CH snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart)) // ── Step 6: Destroy the VM so CH releases the dm device ────────── if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err) } // CH process is dead — uffd handler no longer reads old memory-ranges. // Replace old snapshot dir with new one. 
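	// After the swap, pauseDir holds the fresh CH snapshot files
	// (config.json, memory-ranges, state.json); Step 7 below adds the saved
	// CoW file and the rootfs metadata next to them.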
if err := os.RemoveAll(pauseDir); err != nil { slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err) } if err := os.Rename(tmpPauseDir, pauseDir); err != nil { m.cleanupPauseFailure(sb, sandboxID, pauseDir) return fmt.Errorf("rename snapshot dir: %w", err) } // ── Step 7: Remove dm-snapshot and save CoW ────────────────────── if sb.dmDevice != nil { dmDev := sb.dmDevice sb.dmDevice = nil if err := devicemapper.RemoveSnapshot(ctx, dmDev); err != nil { m.cleanupPauseFailure(sb, sandboxID, pauseDir) return fmt.Errorf("remove dm-snapshot: %w", err) } snapshotCow := snapshot.CowPath(pauseDir, "") if err := os.Rename(dmDev.CowPath, snapshotCow); err != nil { os.Remove(dmDev.CowPath) m.cleanupPauseFailure(sb, sandboxID, pauseDir) return fmt.Errorf("move cow file: %w", err) } if err := snapshot.WriteMeta(pauseDir, "", &snapshot.RootfsMeta{ BaseTemplate: sb.baseImagePath, TemplateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}), VCPUs: sb.VCPUs, MemoryMB: sb.MemoryMB, }); err != nil { m.cleanupPauseFailure(sb, sandboxID, pauseDir) return fmt.Errorf("write rootfs meta: %w", err) } } // ── Step 8: Clean up remaining resources ───────────────────────── if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart)) return nil } // Resume restores a paused sandbox from its CH snapshot. // CH handles memory restore internally (with on-demand paging). // The sandbox gets a new network slot. // Optional defaultUser and defaultEnv are applied via PostInit with // sandbox_id and template_id so envd picks up the new sandbox's metadata. func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) { pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) if _, err := os.Stat(pauseDir); err != nil { return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID) } // Read rootfs metadata to find the base template image. meta, err := snapshot.ReadMeta(pauseDir, "") if err != nil { return nil, fmt.Errorf("read rootfs meta: %w", err) } // Acquire the base image loop device and restore dm-snapshot from saved CoW. baseImagePath := meta.BaseTemplate originLoop, err := m.loops.Acquire(baseImagePath) if err != nil { return nil, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(baseImagePath) return nil, fmt.Errorf("get origin size: %w", err) } // Move CoW file from snapshot dir to sandboxes dir for the running sandbox. savedCow := snapshot.CowPath(pauseDir, "") cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID)) if err := os.Rename(savedCow, cowPath); err != nil { m.loops.Release(baseImagePath) return nil, fmt.Errorf("move cow file: %w", err) } // Restore dm-snapshot from existing persistent CoW file. 
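	// Unlike Create, no fresh CoW is allocated here: the CoW saved at pause
	// time is reattached on top of the base image, so every block the guest
	// wrote before the pause is visible again on the restored device.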
dmName := "wrenn-" + sandboxID dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize) if err != nil { m.loops.Release(baseImagePath) os.Rename(cowPath, savedCow) return nil, fmt.Errorf("restore dm-snapshot: %w", err) } res := &createResources{ sandboxID: sandboxID, loops: m.loops, loopImage: baseImagePath, dmDevice: dmDev, slots: m.slots, rollCow: func() { if err := os.Rename(cowPath, savedCow); err != nil { slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err) } }, } // Allocate network slot. slotIdx, err := m.slots.Allocate() if err != nil { res.rollback() return nil, fmt.Errorf("allocate network slot: %w", err) } res.slotIdx = slotIdx slot := network.NewSlot(slotIdx) if err := network.CreateNetwork(slot); err != nil { res.rollback() return nil, fmt.Errorf("create network: %w", err) } res.slot = slot // Restore VM from CH snapshot. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: meta.TemplateID, KernelPath: m.resolveKernelPath(kernelVersion), RootfsPath: dmDev.DevicePath, MemoryMB: meta.MemoryMB, NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, VMMBin: m.cfg.VMMBin, } if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, pauseDir); err != nil { res.rollback() return nil, fmt.Errorf("restore VM from snapshot: %w", err) } res.vm = m.vm // Wait for envd to be ready. client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) if err := client.WaitUntilReady(waitCtx); err != nil { waitCancel() res.rollback() return nil, fmt.Errorf("wait for envd: %w", err) } waitCancel() // PostInit with sandbox_id and template_id so envd sets metadata env vars. // Fire-and-forget: post-init is non-critical (metadata/env vars), and // blocking here widens the window for the CP monitor to race against us. go func() { initCtx, initCancel := context.WithTimeout(context.Background(), 10*time.Second) defer initCancel() if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, meta.TemplateID); err != nil { slog.Warn("post-init failed after resume, metadata may be stale", "sandbox", sandboxID, "error", err) } }() // Deflate balloon — the snapshot was taken with an inflated balloon. if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil { slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err) } // Wait for balloon deflation to settle. With OnDemand UFFD restore, // the guest is under severe memory pressure while the balloon is still // inflated. If the sampler or memory reclaimer runs during this window, // the resulting page fault storm can make the guest kernel unresponsive. if meta.MemoryMB > 0 { threshold := int64(float64(meta.MemoryMB) * 0.80 * 1024 * 1024) settleCtx, settleCancel := context.WithTimeout(ctx, 10*time.Second) ticker := time.NewTicker(500 * time.Millisecond) settled := false for !settled { select { case <-settleCtx.Done(): slog.Warn("resume: balloon deflation did not settle in time", "id", sandboxID) settled = true case <-ticker.C: used, err := readEnvdMemUsed(settleCtx, client) if err != nil { continue } if used < threshold { slog.Info("resume: balloon deflation settled", "id", sandboxID, "used_mib", used/(1024*1024)) settled = true } } } ticker.Stop() settleCancel() } // Fetch envd version (best-effort). 
envdVersion, _ := client.FetchVersion(ctx) vcpus := meta.VCPUs if vcpus <= 0 { vcpus = 1 } memoryMB := meta.MemoryMB if memoryMB <= 0 { memoryMB = 512 } now := time.Now() sb := &sandboxState{ Sandbox: models.Sandbox{ ID: sandboxID, Status: models.StatusRunning, VCPUs: vcpus, MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, slot: slot, client: client, connTracker: &ConnTracker{}, dmDevice: dmDev, baseImagePath: baseImagePath, } m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() m.startSampler(sb) m.startCrashWatcher(sb) slog.Info("sandbox resumed from snapshot", "id", sandboxID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, "vcpus", vcpus, ) return &sb.Sandbox, nil } // CreateSnapshot creates a reusable template from a sandbox. Works on both // running and paused sandboxes. If the sandbox is running, it is paused first. // The sandbox remains paused after this call (it can still be resumed). // // The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4 // so the template has no dependency on the original base image. Memory state // and VM snapshot files are copied as-is. func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) { // If the sandbox is running, pause it first. if _, err := m.get(sandboxID); err == nil { if err := m.Pause(ctx, sandboxID); err != nil { return 0, fmt.Errorf("pause sandbox: %w", err) } } // At this point, pause snapshot files must exist. pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) if _, err := os.Stat(pauseDir); err != nil { return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID) } // Create template directory. dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if err := os.MkdirAll(dstDir, 0755); err != nil { return 0, fmt.Errorf("create template dir: %w", err) } // Copy CH snapshot files (config.json, memory-ranges, state.json). for _, fname := range []string{snapshot.CHConfigFile, snapshot.CHMemRangesFile, snapshot.CHStateFile} { src := filepath.Join(pauseDir, fname) dst := filepath.Join(dstDir, fname) if _, err := os.Stat(src); err != nil { continue // some files may not exist } if err := copyFile(src, dst); err != nil { warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("copy %s: %w", fname, err) } } // Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image. meta, err := snapshot.ReadMeta(pauseDir, "") if err != nil { warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("read rootfs meta: %w", err) } originLoop, err := m.loops.Acquire(meta.BaseTemplate) if err != nil { warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("acquire loop device for flatten: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(meta.BaseTemplate) warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("get origin size: %w", err) } // Temporarily restore the dm-snapshot to read the merged view. 
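	// The merged view presents base image + saved CoW as a single block
	// device; FlattenSnapshot then copies that view into a standalone
	// rootfs.ext4 so the resulting template no longer references the base.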
cowPath := snapshot.CowPath(pauseDir, "") tmpDmName := "wrenn-flatten-" + sandboxID tmpDev, err := devicemapper.RestoreSnapshot(ctx, tmpDmName, originLoop, cowPath, originSize) if err != nil { m.loops.Release(meta.BaseTemplate) warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err) } // Flatten to new standalone rootfs. flattenedPath := filepath.Join(dstDir, snapshot.RootfsFileName) flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath) // Always clean up the temporary dm device. warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), tmpDev)) m.loops.Release(meta.BaseTemplate) if flattenErr != nil { warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("flatten rootfs: %w", flattenErr) } sizeBytes, err := snapshot.DirSize(dstDir, "") if err != nil { slog.Warn("failed to calculate snapshot size", "error", err) } slog.Info("template snapshot created (rootfs flattened)", "sandbox", sandboxID, "team_id", teamID, "template_id", templateID, "size_bytes", sizeBytes, ) return sizeBytes, nil } // FlattenRootfs stops a running sandbox, flattens its device-mapper CoW // rootfs into a standalone rootfs.ext4, and cleans up all resources. // The result is an image-only template (no VM memory/CPU state) stored in // ImagesDir/{name}/rootfs.ext4. func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) { m.mu.Lock() sb, ok := m.boxes[sandboxID] if ok { delete(m.boxes, sandboxID) } m.mu.Unlock() if !ok { return 0, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } // Flush guest page cache to disk before stopping the VM. Without this, // files written by the build (e.g. pip-installed packages) may exist in the // guest's page cache but not yet on the dm block device — flatten would then // capture 0-byte files. func() { syncCtx, cancel := context.WithTimeout(ctx, 10*time.Second) defer cancel() if _, err := sb.client.Exec(syncCtx, "/bin/sync", nil, nil); err != nil { slog.Warn("flatten: guest sync failed (non-fatal)", "id", sb.ID, "error", err) } }() // Stop the VM but keep the dm device alive for flattening. m.stopSampler(sb) if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error during flatten", "id", sb.ID, "error", err) } // Release network resources — not needed after VM is stopped. if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network cleanup error during flatten", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) // Create template directory and flatten the dm-snapshot. flattenDstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if err := os.MkdirAll(flattenDstDir, 0755); err != nil { m.cleanupDM(sb) return 0, fmt.Errorf("create template dir: %w", err) } outputPath := filepath.Join(flattenDstDir, snapshot.RootfsFileName) if sb.dmDevice == nil { m.cleanupDM(sb) warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir)) return 0, fmt.Errorf("sandbox %s has no dm device", sandboxID) } if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, outputPath); err != nil { m.cleanupDM(sb) warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir)) return 0, fmt.Errorf("flatten rootfs: %w", err) } // Clean up dm device and loop device now that flatten is complete. 
m.cleanupDM(sb) // Shrink the flattened image to its minimum size, then re-expand to the // configured default rootfs size so sandboxes see the full disk from boot. if out, err := exec.Command("e2fsck", "-fy", outputPath).CombinedOutput(); err != nil { if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() > 1 { slog.Warn("e2fsck before shrink failed (non-fatal)", "output", string(out), "error", err) } } if out, err := exec.Command("resize2fs", "-M", outputPath).CombinedOutput(); err != nil { slog.Warn("resize2fs -M failed (non-fatal)", "output", string(out), "error", err) } // Re-expand to default rootfs size. targetMB := m.cfg.DefaultRootfsSizeMB if targetMB <= 0 { targetMB = DefaultDiskSizeMB } if err := expandImage(outputPath, int64(targetMB)*1024*1024, targetMB); err != nil { slog.Warn("failed to expand template to default size (non-fatal)", "error", err) } sizeBytes, err := snapshot.DirSize(flattenDstDir, "") if err != nil { slog.Warn("failed to calculate template size", "error", err) } slog.Info("rootfs flattened to image-only template", "sandbox", sandboxID, "team_id", teamID, "template_id", templateID, "size_bytes", sizeBytes, ) return sizeBytes, nil } // cleanupDM tears down the dm-snapshot device and releases the base image loop device. func (m *Manager) cleanupDM(sb *sandboxState) { if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) } os.Remove(sb.dmDevice.CowPath) } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } } // DeleteSnapshot removes a snapshot template from disk. func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error { return os.RemoveAll(layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)) } // createFromSnapshot creates a new sandbox by restoring from a snapshot template. // CH handles memory restore internally (with on-demand paging). // The template's rootfs.ext4 is a flattened standalone image — we create a // dm-snapshot on top of it just like a normal Create. func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int) (*models.Sandbox, error) { tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) // Set up dm-snapshot on the template's flattened rootfs. baseRootfs := filepath.Join(tmplDir, snapshot.RootfsFileName) originLoop, err := m.loops.Acquire(baseRootfs) if err != nil { return nil, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(baseRootfs) return nil, fmt.Errorf("get origin size: %w", err) } dmName := "wrenn-" + sandboxID cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID)) cowSize := max(int64(diskSizeMB)*1024*1024, originSize) dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize) if err != nil { m.loops.Release(baseRootfs) return nil, fmt.Errorf("create dm-snapshot: %w", err) } res := &createResources{ sandboxID: sandboxID, loops: m.loops, loopImage: baseRootfs, dmDevice: dmDev, cowPath: cowPath, slots: m.slots, } // Allocate network. 
slotIdx, err := m.slots.Allocate() if err != nil { res.rollback() return nil, fmt.Errorf("allocate network slot: %w", err) } res.slotIdx = slotIdx slot := network.NewSlot(slotIdx) if err := network.CreateNetwork(slot); err != nil { res.rollback() return nil, fmt.Errorf("create network: %w", err) } res.slot = slot // Restore VM from CH snapshot. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: id.UUIDString(templateID), KernelPath: m.cfg.KernelPath, RootfsPath: dmDev.DevicePath, VCPUs: vcpus, MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, VMMBin: m.cfg.VMMBin, } if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, tmplDir); err != nil { res.rollback() return nil, fmt.Errorf("restore VM from snapshot: %w", err) } res.vm = m.vm // Wait for envd. client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) if err := client.WaitUntilReady(waitCtx); err != nil { waitCancel() res.rollback() return nil, fmt.Errorf("wait for envd: %w", err) } waitCancel() // PostInit with sandbox_id and template_id so envd sets metadata. initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) defer initCancel() if err := client.PostInitWithDefaults(initCtx, "", nil, sandboxID, id.UUIDString(templateID)); err != nil { slog.Warn("post-init failed after template restore, metadata may be stale", "sandbox", sandboxID, "error", err) } // Deflate balloon — template snapshot was taken with an inflated balloon. if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil { slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err) } // Fetch envd version (best-effort). envdVersion, _ := client.FetchVersion(ctx) now := time.Now() sb := &sandboxState{ Sandbox: models.Sandbox{ ID: sandboxID, Status: models.StatusRunning, TemplateTeamID: teamID.Bytes, TemplateID: templateID.Bytes, VCPUs: vcpus, MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, slot: slot, client: client, connTracker: &ConnTracker{}, dmDevice: dmDev, baseImagePath: baseRootfs, } m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() m.startSampler(sb) m.startCrashWatcher(sb) slog.Info("sandbox created from snapshot", "id", sandboxID, "team_id", teamID, "template_id", templateID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, nil } // Exec runs a command inside a sandbox. func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args []string, opts *envdclient.ExecOpts) (*envdclient.ExecResult, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.Exec(ctx, cmd, args, opts) } // ExecStream runs a command inside a sandbox and returns a channel of streaming events. 
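//
// A minimal consumption sketch (the event payload is defined by envdclient
// and not shown here; mgr is a *Manager):
//
//	events, err := mgr.ExecStream(ctx, sandboxID, "ls", "-la")
//	if err != nil {
//		return err
//	}
//	for ev := range events {
//		_ = ev // stream each event back to the caller
//	}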
func (m *Manager) ExecStream(ctx context.Context, sandboxID string, cmd string, args ...string) (<-chan envdclient.ExecStreamEvent, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.ExecStream(ctx, cmd, args...) } // List returns all sandboxes. func (m *Manager) List() []models.Sandbox { m.mu.RLock() defer m.mu.RUnlock() result := make([]models.Sandbox, 0, len(m.boxes)) for _, sb := range m.boxes { result = append(result, sb.Sandbox) } return result } // Get returns a sandbox by ID. func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } return &sb.Sandbox, nil } // GetClient returns the envd client for a sandbox. func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } return sb.client, nil } // SetDefaults calls envd's PostInit to configure the default user and // environment variables for a running sandbox. This is called by the host // agent after sandbox creation or resume when the template specifies defaults. func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string, defaultEnv map[string]string) error { if defaultUser == "" && len(defaultEnv) == 0 { return nil } sb, err := m.get(sandboxID) if err != nil { return err } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } return sb.client.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "") } // PtyAttach starts a new PTY process or reconnects to an existing one. // If cmd is non-empty, starts a new process. If empty, reconnects using tag. func (m *Manager) PtyAttach(ctx context.Context, sandboxID, tag, cmd string, args []string, cols, rows uint32, envs map[string]string, cwd string) (<-chan envdclient.PtyEvent, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() if cmd != "" { return sb.client.PtyStart(ctx, tag, cmd, args, cols, rows, envs, cwd) } return sb.client.PtyConnect(ctx, tag) } // PtySendInput sends raw bytes to a PTY process in a sandbox. func (m *Manager) PtySendInput(ctx context.Context, sandboxID, tag string, data []byte) error { sb, err := m.get(sandboxID) if err != nil { return err } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.PtySendInput(ctx, tag, data) } // PtyResize updates the terminal dimensions for a PTY process in a sandbox. func (m *Manager) PtyResize(ctx context.Context, sandboxID, tag string, cols, rows uint32) error { sb, err := m.get(sandboxID) if err != nil { return err } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } return sb.client.PtyResize(ctx, tag, cols, rows) } // PtyKill sends SIGKILL to a PTY process in a sandbox. 
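// The tag identifies the PTY session previously created via PtyAttach, the
// same identifier accepted by PtySendInput and PtyResize.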
func (m *Manager) PtyKill(ctx context.Context, sandboxID, tag string) error { sb, err := m.get(sandboxID) if err != nil { return err } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } return sb.client.PtyKill(ctx, tag) } // StartBackground starts a background process inside a sandbox. func (m *Manager) StartBackground(ctx context.Context, sandboxID, tag, cmd string, args []string, envs map[string]string, cwd string) (uint32, error) { sb, err := m.get(sandboxID) if err != nil { return 0, err } if sb.Status != models.StatusRunning { return 0, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.StartBackground(ctx, tag, cmd, args, envs, cwd) } // ConnectProcess re-attaches to a running process inside a sandbox. func (m *Manager) ConnectProcess(ctx context.Context, sandboxID string, pid uint32, tag string) (<-chan envdclient.ExecStreamEvent, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.ConnectProcess(ctx, pid, tag) } // ListProcesses returns all running processes inside a sandbox. func (m *Manager) ListProcesses(ctx context.Context, sandboxID string) ([]envdclient.ProcessInfo, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.ListProcesses(ctx) } // KillProcess sends a signal to a process inside a sandbox. func (m *Manager) KillProcess(ctx context.Context, sandboxID string, pid uint32, tag string, signal envdpb.Signal) error { sb, err := m.get(sandboxID) if err != nil { return err } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return sb.client.KillProcess(ctx, pid, tag, signal) } // AcquireProxyConn atomically looks up a sandbox by ID and registers an // in-flight proxy connection. Returns the sandbox's host-reachable IP, the // connection tracker, and true on success. The caller must call // tracker.Release() when the request completes. Returns zero values and // false if the sandbox is not found, not running, or is draining for a pause. func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok || sb.Status != models.StatusRunning { return nil, nil, false } if !sb.connTracker.Acquire() { return nil, nil, false } return sb.HostIP, sb.connTracker, true } // Ping resets the inactivity timer for a running sandbox. func (m *Manager) Ping(sandboxID string) error { m.mu.Lock() defer m.mu.Unlock() sb, ok := m.boxes[sandboxID] if !ok { return fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } sb.LastActiveAt = time.Now() return nil } // DrainAutoPausedIDs returns and clears the list of sandbox IDs that were // automatically paused by the TTL reaper since the last call. 
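// The reaper appends to this list only after a successful auto-pause;
// callers (typically the host's reconcile loop) report the drained IDs to
// the control plane so its view of sandbox state stays consistent.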
func (m *Manager) DrainAutoPausedIDs() []string { m.autoPausedMu.Lock() defer m.autoPausedMu.Unlock() ids := m.autoPausedIDs m.autoPausedIDs = nil return ids } func (m *Manager) get(sandboxID string) (*sandboxState, error) { m.mu.RLock() defer m.mu.RUnlock() sb, ok := m.boxes[sandboxID] if !ok { return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } return sb, nil } // StartTTLReaper starts a background goroutine that destroys sandboxes // that have exceeded their TTL (timeout_sec of inactivity). func (m *Manager) StartTTLReaper(ctx context.Context) { go func() { ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() for { select { case <-ctx.Done(): return case <-m.stopCh: return case <-ticker.C: m.reapExpired(ctx) } } }() } func (m *Manager) reapExpired(_ context.Context) { m.mu.RLock() var expired []string now := time.Now() for id, sb := range m.boxes { if sb.TimeoutSec <= 0 { continue } if sb.Status != models.StatusRunning { continue } if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second { expired = append(expired, id) } } m.mu.RUnlock() for _, id := range expired { slog.Info("TTL expired, auto-pausing sandbox", "id", id) // Use a detached context so that an app shutdown does not cancel // a pause mid-flight, which would leave the VM frozen without a // valid snapshot. pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) err := m.Pause(pauseCtx, id) cancel() if err != nil { slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err) if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil { slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr) } else if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.stopped", SandboxID: id, }) } continue } m.autoPausedMu.Lock() m.autoPausedIDs = append(m.autoPausedIDs, id) m.autoPausedMu.Unlock() if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.auto_paused", SandboxID: id, }) } } } // Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper. func (m *Manager) Shutdown(ctx context.Context) { close(m.stopCh) m.mu.Lock() ids := make([]string, 0, len(m.boxes)) for id := range m.boxes { ids = append(ids, id) } m.mu.Unlock() for _, sbID := range ids { slog.Info("shutdown: destroying sandbox", "id", sbID) if err := m.Destroy(ctx, sbID); err != nil { slog.Warn("shutdown destroy failed", "id", sbID, "error", err) } } m.loops.ReleaseAll() } // PauseAll pauses every running sandbox managed by this host agent. // Called when the host loses connectivity to the control plane to avoid // leaving running VMs unmanaged. It is best-effort: failures for individual // sandboxes are logged but do not stop the rest. func (m *Manager) PauseAll(ctx context.Context) { m.mu.RLock() ids := make([]string, 0, len(m.boxes)) for id, sb := range m.boxes { if sb.Status == models.StatusRunning { ids = append(ids, id) } } m.mu.RUnlock() slog.Info("pausing all running sandboxes due to CP connection loss", "count", len(ids)) for _, sbID := range ids { if err := m.Pause(ctx, sbID); err != nil { slog.Warn("PauseAll: failed to pause sandbox", "id", sbID, "error", err) } } } // warnErr logs a warning if err is non-nil. Used for best-effort cleanup // in error paths where the primary error has already been captured. 
func warnErr(msg string, id string, err error) { if err != nil { slog.Warn(msg, "id", id, "error", err) } } // createResources tracks partially-acquired resources during sandbox creation // so they can be rolled back in reverse order on failure. type createResources struct { sandboxID string loops *devicemapper.LoopRegistry vm *vm.Manager loopImage string dmDevice *devicemapper.SnapshotDevice cowPath string slotIdx int slots *network.SlotAllocator slot *network.Slot rollCow func() // optional custom cow rollback (e.g. rename back) } func (r *createResources) rollback() { if r.vm != nil && r.sandboxID != "" { warnErr("vm destroy error", r.sandboxID, r.vm.Destroy(context.Background(), r.sandboxID)) } if r.slot != nil { warnErr("network cleanup error", r.sandboxID, network.RemoveNetwork(r.slot)) } if r.slots != nil && r.slotIdx > 0 { r.slots.Release(r.slotIdx) } if r.dmDevice != nil { warnErr("dm-snapshot remove error", r.sandboxID, devicemapper.RemoveSnapshot(context.Background(), r.dmDevice)) } if r.rollCow != nil { r.rollCow() } else if r.cowPath != "" { os.Remove(r.cowPath) } if r.loopImage != "" { r.loops.Release(r.loopImage) } } // startCrashWatcher monitors the VM process for unexpected exits. // If the process exits while the sandbox is still in m.boxes (i.e. not a // deliberate Destroy), the sandbox is cleaned up and a sandbox.error event // is pushed to the control plane. func (m *Manager) startCrashWatcher(sb *sandboxState) { v, ok := m.vm.Get(sb.ID) if !ok { return } go func() { select { case <-v.Exited(): case <-m.stopCh: return } // Check if this was a deliberate Destroy/Pause (sandbox already removed // from boxes, or Pause owns the cleanup). m.mu.Lock() _, stillAlive := m.boxes[sb.ID] if stillAlive && sb.Status == models.StatusPausing { stillAlive = false } if stillAlive { delete(m.boxes, sb.ID) } m.mu.Unlock() if !stillAlive { return } slog.Error("VM process crashed, cleaning up", "id", sb.ID) sb.lifecycleMu.Lock() m.cleanupAfterCrash(sb) sb.lifecycleMu.Unlock() if m.onDestroy != nil { m.onDestroy(sb.ID) } if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.error", SandboxID: sb.ID, }) } }() } // cleanupAfterCrash tears down sandbox resources after a VM crash. // The VM process is already dead so we skip vm.Destroy and just clean up // network, device-mapper, and loop devices. func (m *Manager) cleanupAfterCrash(sb *sandboxState) { m.stopSampler(sb) // Remove the VM from the vm.Manager's map (process is already dead). _ = m.vm.Destroy(context.Background(), sb.ID) if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err) } os.Remove(sb.dmDevice.CowPath) } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } } // startSampler resolves the VMM PID and starts a background goroutine // that samples CPU/mem/disk at 1s intervals into the ring buffer. // Must be called after the sandbox is registered in m.boxes. func (m *Manager) startSampler(sb *sandboxState) { v, ok := m.vm.Get(sb.ID) if !ok { slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID) return } // v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script" // invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor) // occupies the same PID. 
v.PID() IS the VMM PID. vmmPID := v.PID() sb.vmmPID = vmmPID sb.ring = newMetricsRing() ctx, cancel := context.WithCancel(context.Background()) sb.samplerCancel = cancel sb.samplerDone = make(chan struct{}) // Read initial CPU counters for delta calculation. // Passed to goroutine as local state — no shared mutation. initialCPU, err := readCPUStat(vmmPID) if err != nil { slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err) } go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU) } // samplerLoop samples metrics at 1s intervals. // lastCPU is goroutine-local to avoid shared-state races. func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) { defer close(sb.samplerDone) ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux lastTime := time.Now() cpuInitialized := lastCPU != (cpuStat{}) for { select { case <-ctx.Done(): return case now := <-ticker.C: elapsed := now.Sub(lastTime).Seconds() lastTime = now // CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100 var cpuPct float64 cur, err := readCPUStat(vmmPID) if err == nil { if cpuInitialized && elapsed > 0 && vcpus > 0 { deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime)) cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0 if cpuPct > 100.0 { cpuPct = 100.0 } if cpuPct < 0 { cpuPct = 0 } } lastCPU = cur cpuInitialized = true } // Memory: guest-reported used memory from envd /metrics. // VmRSS of the VMM process includes guest page cache and never // decreases, so we use the guest's own view which reports // total - available (actual process memory). memBytes, _ := readEnvdMemUsed(ctx, sb.client) // Disk: allocated bytes of the CoW sparse file. var diskBytes int64 if sb.dmDevice != nil { diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath) } sb.ring.Push(MetricPoint{ Timestamp: now, CPUPct: cpuPct, MemBytes: memBytes, DiskBytes: diskBytes, }) } } } // stopSampler stops the metrics sampling goroutine and waits for it to exit. func (m *Manager) stopSampler(sb *sandboxState) { if sb.samplerCancel != nil { sb.samplerCancel() <-sb.samplerDone sb.samplerCancel = nil } } // GetMetrics returns the ring buffer data for the given range tier. // Valid ranges: "10m", "2h", "24h". func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok { return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } if sb.ring == nil { return nil, nil } // Map the requested range to the appropriate ring tier and time cutoff. var points []MetricPoint var cutoff time.Duration switch rangeTier { case "5m": points = sb.ring.Get10m() cutoff = 5 * time.Minute case "10m": points = sb.ring.Get10m() cutoff = 10 * time.Minute case "1h": points = sb.ring.Get2h() cutoff = 1 * time.Hour case "2h": points = sb.ring.Get2h() cutoff = 2 * time.Hour case "6h": points = sb.ring.Get24h() cutoff = 6 * time.Hour case "12h": points = sb.ring.Get24h() cutoff = 12 * time.Hour case "24h": points = sb.ring.Get24h() cutoff = 24 * time.Hour default: return nil, fmt.Errorf("invalid range: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", rangeTier) } // Filter points to the requested time window. 
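	// For example, rangeTier "5m" reads the 10m ring but drops points whose
	// timestamp is older than now minus five minutes, so a coarser ring can
	// serve several window sizes.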
threshold := time.Now().Add(-cutoff) filtered := points[:0:0] for _, p := range points { if !p.Timestamp.Before(threshold) { filtered = append(filtered, p) } } return filtered, nil } // FlushMetrics returns all three tier ring buffers, clears the ring, and // stops the sampler goroutine. Called by the control plane before pause/destroy. func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok { return nil, nil, nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } m.stopSampler(sb) if sb.ring == nil { return nil, nil, nil, nil } pts10m, pts2h, pts24h = sb.ring.Flush() return pts10m, pts2h, pts24h, nil } // copyFile copies a regular file from src to dst using streaming I/O. func copyFile(src, dst string) error { sf, err := os.Open(src) if err != nil { return fmt.Errorf("open %s: %w", src, err) } defer sf.Close() df, err := os.Create(dst) if err != nil { return fmt.Errorf("create %s: %w", dst, err) } defer df.Close() if _, err := df.ReadFrom(sf); err != nil { os.Remove(dst) return fmt.Errorf("copy %s → %s: %w", src, dst, err) } return nil }
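
// For illustration only: a host agent would typically wire the manager up
// roughly as follows (paths and versions here are hypothetical, and the
// resolved kernel/VMM fields are normally filled in at startup).
//
//	mgr := New(Config{
//		WrennDir:      "/var/lib/wrenn",
//		EnvdTimeout:   30 * time.Second,
//		KernelPath:    "/var/lib/wrenn/kernels/vmlinux-6.1.34",
//		KernelVersion: "6.1.34",
//		VMMBin:        "/usr/local/bin/cloud-hypervisor",
//	})
//	mgr.StartTTLReaper(ctx)
//	defer mgr.Shutdown(context.Background())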