package sandbox

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/jackc/pgx/v5/pgtype"

	"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
	"git.omukk.dev/wrenn/wrenn/internal/envdclient"
	"git.omukk.dev/wrenn/wrenn/internal/layout"
	"git.omukk.dev/wrenn/wrenn/internal/models"
	"git.omukk.dev/wrenn/wrenn/internal/network"
	"git.omukk.dev/wrenn/wrenn/internal/vm"
	"git.omukk.dev/wrenn/wrenn/pkg/id"
	envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen"
)

// Sentinel errors. They are returned wrapped (via %w) with the sandbox ID and
// extra context, so use errors.Is to detect them rather than string-matching.
var (
	// ErrNotFound is returned when a sandbox is not present in the in-memory map.
	ErrNotFound = errors.New("sandbox not found")
	// ErrNotRunning is returned when an operation requires StatusRunning but
	// the sandbox is in another state (or its envd client has been cleared
	// concurrently by a pause).
	ErrNotRunning = errors.New("sandbox not running")
	// ErrNotPaused is returned when an operation requires StatusPaused but
	// the sandbox is in another state.
	ErrNotPaused = errors.New("sandbox not paused")
	// ErrInvalidRange is returned when a metrics range parameter is invalid.
	ErrInvalidRange = errors.New("invalid range")
)

// MinTimeoutSec is the minimum inactivity TTL, in seconds, accepted by
// Create/Resume. 0 keeps the "no TTL" semantic; any positive value below this
// is raised to it.
//
// Rationale: very short TTLs race the post-create/post-resume startup window
// (m.boxes insertion → /init → startMemoryLoader). With memLoadDone unset
// for a brief moment, the reaper guard does not fire and a sub-second
// TimeoutSec could auto-pause a sandbox before its memory loader arms,
// producing a stale ch.snapshot. 60s is well above the startup envelope.
const MinTimeoutSec = 60

// clampTimeout normalises a caller-supplied TTL. 0 (or any negative value)
// means "no TTL" and yields 0; positive values are floored at MinTimeoutSec.
func clampTimeout(timeoutSec int) int { if timeoutSec <= 0 { return 0 } if timeoutSec < MinTimeoutSec { return MinTimeoutSec } return timeoutSec } // envdReadyTimeoutFloor is the minimum time to wait for envd's /health to // answer after a fresh boot or restore. const envdReadyTimeoutFloor = 120 * time.Second // envdReadyTimeoutPerGB scales the wait budget with guest RAM: larger VMs // take longer to cold-boot (struct-page init, multi-vCPU bringup, cold // dm-snapshot I/O). const envdReadyTimeoutPerGB = 8 * time.Second // envdReadyTimeout returns the WaitUntilReady deadline for a VM with the given // memory size: 8s per GiB of RAM, floored at 120s. A 20 GiB guest gets 160s. func envdReadyTimeout(memoryMB int) time.Duration { gb := (memoryMB + 1023) / 1024 // round up scaled := time.Duration(gb) * envdReadyTimeoutPerGB if scaled < envdReadyTimeoutFloor { return envdReadyTimeoutFloor } return scaled } // Config holds the paths and defaults for the sandbox manager. type Config struct { WrennDir string // root directory (e.g. /var/lib/wrenn); all sub-paths derived via layout package EnvdTimeout time.Duration DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB // Resolved at startup by the host agent. KernelPath string // path to the latest vmlinux-x.y.z KernelVersion string // semver extracted from filename VMMBin string // path to the cloud-hypervisor binary VMMVersion string // semver from cloud-hypervisor --version AgentVersion string // host agent version (injected via ldflags) } // LifecycleEvent describes an autonomous state change initiated by the agent. type LifecycleEvent struct { Event string SandboxID string } // EventSender sends autonomous lifecycle events to the control plane. // SendAsync is fire-and-forget; Send blocks with retries and returns the // final error so callers running under a shutdown deadline can guarantee // delivery before process exit. 
type EventSender interface {
	SendAsync(event LifecycleEvent)
	Send(ctx context.Context, event LifecycleEvent) error
}

// ErrDraining is returned by Create / Resume when the manager has begun
// shutdown. The agent process is about to pause every running sandbox and
// exit; admitting new lifecycle work would race the destroy loop and leave
// orphaned VMs after the process is gone.
var ErrDraining = errors.New("agent is draining for shutdown")

// Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd.
type Manager struct {
	cfg   Config
	vm    *vm.Manager
	slots *network.SlotAllocator
	loops *devicemapper.LoopRegistry

	// mu guards boxes and creates.
	mu    sync.RWMutex
	boxes map[string]*sandboxState
	// stopCh is closed by Shutdown; the TTL reaper and the per-sandbox crash
	// watchers select on it to exit.
	stopCh chan struct{}

	// draining is set at the start of Shutdown. Create and Resume check it
	// (atomically, no lock needed) and refuse new work so the destroy loop
	// can run to completion without racing fresh RPCs.
	draining atomic.Bool

	// creates tracks in-flight Create calls by sandbox ID. An entry exists
	// only while Create is acquiring resources / booting the VM, before the
	// sandbox lands in boxes. Destroy consults it to abort a create that
	// would otherwise leak its half-built VM. Guarded by mu.
	creates map[string]*createHandle

	// onDestroy is called with the sandbox ID after cleanup completes.
	// Used by ProxyHandler to evict cached reverse proxies.
	onDestroy func(sandboxID string)

	// eventSender sends autonomous lifecycle events (auto-pause, auto-destroy)
	// to the CP via HTTP callback. Optional — nil means events are only
	// propagated through the HostMonitor reconciler.
	eventSender EventSender
}

// SetOnDestroy registers a callback invoked after each sandbox is cleaned up.
// The field is assigned without locking — presumably this is called once
// during setup, before the manager serves requests (TODO confirm).
func (m *Manager) SetOnDestroy(fn func(sandboxID string)) {
	m.onDestroy = fn
}

// SetEventSender registers the callback sender for autonomous lifecycle events.
// Like SetOnDestroy, the assignment is unsynchronized — presumably a
// setup-time call (TODO confirm).
func (m *Manager) SetEventSender(sender EventSender) {
	m.eventSender = sender
}

// sandboxState holds the runtime state for a single sandbox.
type sandboxState struct {
	// Sandbox is the externally visible record (ID, Status, TimeoutSec,
	// LastActiveAt, ...). List copies it and Get/Create return a pointer to it.
	models.Sandbox

	lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox
	slot        *network.Slot
	// client is published via atomic.Pointer so Exec/Pty/Process callers can
	// load it without holding lifecycleMu. Pause's releaseRuntime stores nil;
	// Resume stores a fresh client. Callers MUST nil-check after Load.
	client        atomic.Pointer[envdclient.Client]
	connTracker   *ConnTracker // tracks in-flight proxy connections for pre-pause drain
	dmDevice      *devicemapper.SnapshotDevice
	baseImagePath string // path to the base template rootfs (for loop registry release)

	// sandboxDirOverride, when non-empty, pins this sandbox's VMConfig.SandboxDir
	// to a path other than the default vm.SandboxTmpDir(sb.ID). Set when the
	// sandbox was launched from a snapshot template — CH's saved config.json
	// hardcodes the *original* source sandbox's tmpfs path, so every subsequent
	// restore (Resume, PauseAll/restart) must reuse that same path or CH cannot
	// find rootfs.ext4 in the new mount namespace.
	sandboxDirOverride string

	// Background memory loading state (set during Resume for UFFD sandboxes).
	// nil for freshly-created sandboxes. For resumed sandboxes, memLoadDone
	// is closed when the background loader finishes (success or failure).
	// The TTL reaper skips a sandbox while memLoadDone is non-nil and still open.
	memLoadDone   chan struct{}      // closed when background memory loader exits
	memLoadCancel context.CancelFunc // cancels the background loader goroutine

	// Metrics sampling state. Populated by startSampler.
	vmmPID        int                // VMM process PID (child of unshare wrapper)
	ring          *metricsRing       // tiered ring buffers for CPU/mem/disk metrics
	samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
	samplerDone   chan struct{}      // closed when the sampling goroutine exits
}

// buildMetadata constructs the metadata map with version information.
func (m *Manager) buildMetadata(envdVersion string) map[string]string { meta := map[string]string{ "kernel_version": m.cfg.KernelVersion, "vmm_version": m.cfg.VMMVersion, "agent_version": m.cfg.AgentVersion, } if envdVersion != "" { meta["envd_version"] = envdVersion } return meta } // createHandle coordinates an in-flight Create with a concurrent Destroy. // cancel aborts the creation context; done is closed once Create has fully // finished — whether it succeeded or rolled back a partial failure. type createHandle struct { cancel context.CancelFunc done chan struct{} } // New creates a new sandbox manager. func New(cfg Config) *Manager { if cfg.EnvdTimeout == 0 { cfg.EnvdTimeout = 30 * time.Second } return &Manager{ cfg: cfg, vm: vm.NewManager(), slots: network.NewSlotAllocator(), loops: devicemapper.NewLoopRegistry(), boxes: make(map[string]*sandboxState), creates: make(map[string]*createHandle), stopCh: make(chan struct{}), } } // TemplateRootfsSize returns the actual disk usage of a template's rootfs // file on this host. Uses block-level accounting (stat.Blocks * 512) so // sparse files (even after EnsureImageSizes expansion) report only the // blocks that are actually allocated on disk. func (m *Manager) TemplateRootfsSize(teamID, templateID pgtype.UUID) (int64, error) { path := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID) info, err := os.Stat(path) if err != nil { return 0, fmt.Errorf("stat template rootfs: %w", err) } if sys, ok := info.Sys().(*syscall.Stat_t); ok { return sys.Blocks * 512, nil } return info.Size(), nil } // Create boots a new sandbox. If the template's TemplateDir contains a CH // memory snapshot (state.json + config.json) it is restored via CH's // --restore + UFFD lazy memory; otherwise a fresh boot from the flattened // rootfs is performed. defaultUser/defaultEnv are forwarded to envd's /init // in both paths. // // If sandboxID is empty, a new ID is generated. 
func (m *Manager) Create( ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int, defaultUser string, defaultEnv map[string]string, ) (*models.Sandbox, int64, error) { if m.draining.Load() { return nil, 0, ErrDraining } if sandboxID == "" { sandboxID = id.FormatSandboxID(id.NewSandboxID()) } if vcpus <= 0 { vcpus = 1 } if memoryMB <= 0 { memoryMB = 512 } if diskSizeMB <= 0 { diskSizeMB = m.cfg.DefaultRootfsSizeMB } timeoutSec = clampTimeout(timeoutSec) // Register an in-flight create handle before acquiring any resources so a // concurrent Destroy can abort this creation and wait for its rollback. // Without this, a Destroy that arrives while the VM is still booting finds // nothing in m.boxes, no-ops, and Create races on to register a VM that no // caller owns — a permanent VM / dm / network / loop leak. createCtx, cancelCreate := context.WithCancel(ctx) handle := &createHandle{cancel: cancelCreate, done: make(chan struct{})} m.mu.Lock() if _, exists := m.boxes[sandboxID]; exists { m.mu.Unlock() cancelCreate() return nil, 0, fmt.Errorf("sandbox %s already exists", sandboxID) } if _, inflight := m.creates[sandboxID]; inflight { m.mu.Unlock() cancelCreate() return nil, 0, fmt.Errorf("sandbox %s create already in progress", sandboxID) } m.creates[sandboxID] = handle m.mu.Unlock() defer func() { m.mu.Lock() delete(m.creates, sandboxID) m.mu.Unlock() cancelCreate() close(handle.done) }() // All subsequent steps run under the cancellable create context so a // concurrent Destroy can interrupt a slow VM boot / envd readiness wait. ctx = createCtx // Snapshot template? Route to the CH-restore path; the launcher manages // its own resource lifecycle and registers the sandbox itself. // // System base templates never carry a memory snapshot; guarding here // prevents a stray state.json (e.g. 
from a failed CreateSnapshot that // mis-targeted a base template) from silently rerouting fresh boots into // the restore path with a confusing error downstream. templateDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if !layout.IsSystemTemplate(teamID, templateID) && layout.IsSnapshotTemplate(templateDir) { return m.createFromSnapshotTemplate(ctx, sandboxID, teamID, templateID, vcpus, memoryMB, timeoutSec, diskSizeMB, defaultUser, defaultEnv) } // Resolve base rootfs image. baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID) if _, err := os.Stat(baseRootfs); err != nil { return nil, 0, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err) } // Acquire shared read-only loop device for the base image. originLoop, err := m.loops.Acquire(baseRootfs) if err != nil { return nil, 0, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { m.loops.Release(baseRootfs) return nil, 0, fmt.Errorf("get origin size: %w", err) } // Create dm-snapshot with per-sandbox CoW file. // CoW must be at least as large as the origin — if every block is // rewritten, the CoW stores a full copy. Undersized CoW causes // dm-snapshot invalidation → EIO on all guest I/O. dmName := "wrenn-" + sandboxID if err := os.MkdirAll(layout.SandboxDir(m.cfg.WrennDir, sandboxID), 0o755); err != nil { m.loops.Release(baseRootfs) return nil, 0, fmt.Errorf("create sandbox dir: %w", err) } cowPath := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID) cowSize := max(int64(diskSizeMB)*1024*1024, originSize) dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize) if err != nil { m.loops.Release(baseRootfs) return nil, 0, fmt.Errorf("create dm-snapshot: %w", err) } res := &createResources{ sandboxID: sandboxID, loops: m.loops, loopImage: baseRootfs, dmDevice: dmDev, cowPath: cowPath, slots: m.slots, } // Allocate network slot. 
slotIdx, err := m.slots.Allocate() if err != nil { res.rollback() return nil, 0, fmt.Errorf("allocate network slot: %w", err) } res.slotIdx = slotIdx slot := network.NewSlot(slotIdx) // Set up network. if err := network.CreateNetwork(slot); err != nil { res.rollback() return nil, 0, fmt.Errorf("create network: %w", err) } res.slot = slot // Boot VM — CH gets the dm device path. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: id.UUIDString(templateID), KernelPath: m.cfg.KernelPath, RootfsPath: dmDev.DevicePath, VCPUs: vcpus, MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, VMMBin: m.cfg.VMMBin, LogDir: filepath.Join(m.cfg.WrennDir, "logs"), } if _, err := m.vm.Create(ctx, vmCfg); err != nil { res.rollback() return nil, 0, fmt.Errorf("create VM: %w", err) } res.vm = m.vm // Wait for envd to be ready. The budget scales with guest RAM — a large // VM cold-boots slower than the minimal default. client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, envdReadyTimeout(memoryMB)) defer waitCancel() if err := client.WaitUntilReady(waitCtx); err != nil { res.rollback() return nil, 0, fmt.Errorf("wait for envd: %w", err) } // Fetch envd version (best-effort). envdVersion, _ := client.FetchVersion(ctx) // Apply template defaults via envd /init (no-op when both empty). 
if defaultUser != "" || len(defaultEnv) > 0 { initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil { slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err) } initCancel() } now := time.Now() sb := &sandboxState{ Sandbox: models.Sandbox{ ID: sandboxID, Status: models.StatusRunning, TemplateTeamID: teamID.Bytes, TemplateID: templateID.Bytes, VCPUs: vcpus, MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, slot: slot, connTracker: &ConnTracker{}, dmDevice: dmDev, baseImagePath: baseRootfs, } sb.client.Store(client) m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() m.startSampler(sb) m.startCrashWatcher(sb) slog.Info("sandbox created", "id", sandboxID, "team_id", teamID, "template_id", templateID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, cowSize, nil } // Destroy stops and cleans up a sandbox. If the sandbox is running, its VM, // network, and rootfs are torn down. Any pause snapshot files are also removed. func (m *Manager) Destroy(ctx context.Context, sandboxID string) error { m.mu.Lock() if handle, inflight := m.creates[sandboxID]; inflight { // A create is still in flight. Cancel it and wait for its rollback to // finish, otherwise the half-built VM / dm-snapshot / network / loop // device it acquired would leak with no owner. If the create instead // raced to success, it will have registered the sandbox in m.boxes by // the time done is closed — the normal teardown below then runs. 
m.mu.Unlock() slog.Info("destroy: aborting in-flight sandbox create", "id", sandboxID) handle.cancel() <-handle.done m.mu.Lock() } sb, ok := m.boxes[sandboxID] // statusAtEntry distinguishes "user is destroying an already-paused // sandbox" (legitimate cleanup → fall through) from "user is destroying // a running sandbox that raced to Paused before we got lifecycleMu" // (preserve snapshot → re-insert and bail). Captured under m.mu so it // reflects the same generation as the boxes-map delete. var statusAtEntry models.SandboxStatus if ok { statusAtEntry = sb.Status delete(m.boxes, sandboxID) } m.mu.Unlock() if ok { // Wait for any in-progress Pause to finish before tearing down resources. sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() // Racing-Pause guard. Only fires when the sandbox was NOT paused at // entry but became paused while we waited for lifecycleMu — i.e. a // concurrent Pause completed under us. In that case the snapshot was // just written to disk and destroying now would wipe a freshly-paused // sandbox. Re-insert into m.boxes (releaseRuntime already cleared // runtime refs; slot reservation retained for Resume) and return nil // so the agent's view stays consistent with the on-disk state. // // A legitimate Destroy of an already-paused sandbox (statusAtEntry == // Paused) falls through to cleanup, which releases the slot and // removes the snapshot dir — the user explicitly asked for deletion. if statusAtEntry != models.StatusPaused && sb.Status == models.StatusPaused { m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() slog.Info("destroy: racing pause completed, preserving snapshot", "id", sandboxID) return nil } m.cleanup(ctx, sb) } // Always clean up pause snapshot files (may exist if sandbox was paused). 
if err := os.RemoveAll(layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)); err != nil { slog.Warn("snapshot cleanup error", "id", sandboxID, "error", err) } if m.onDestroy != nil { m.onDestroy(sandboxID) } slog.Info("sandbox destroyed", "id", sandboxID) return nil } // cleanup tears down all resources for a sandbox. func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { if sb.memLoadCancel != nil { sb.memLoadCancel() if sb.memLoadDone != nil { <-sb.memLoadDone } } m.stopSampler(sb) if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error", "id", sb.ID, "error", err) } if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network cleanup error", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) // Tear down dm-snapshot and release the base image loop device. if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) } os.Remove(sb.dmDevice.CowPath) } // Paused branch: dm-snapshot and loop were already released by // releaseRuntime; the CoW file inside the sandbox dir is removed by // Destroy's os.RemoveAll(SandboxDir) below. if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } } // Pause, Resume, CreateSnapshot, FlattenRootfs, DeleteSnapshot, PauseAll // are implemented in pause.go. // activeClient resolves sandboxID to its envd client when the sandbox is in // StatusRunning and the client has not been cleared by a concurrent pause. // It bumps LastActiveAt as a side effect. Returns ErrNotFound if missing or // ErrNotRunning (wrapped with context) otherwise. // // All Exec/Pty/Process methods funnel through this — it is the single // chokepoint that guarantees we never deref a stale sb.client. 
func (m *Manager) activeClient(sandboxID string) (*envdclient.Client, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status) } c := sb.client.Load() if c == nil { // Race: status flipped from Running between m.get and Load (pause's // releaseRuntime cleared the pointer). return nil, fmt.Errorf("%w: %s (client cleared)", ErrNotRunning, sandboxID) } m.mu.Lock() sb.LastActiveAt = time.Now() m.mu.Unlock() return c, nil } // Exec runs a command inside a sandbox. func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args []string, opts *envdclient.ExecOpts) (*envdclient.ExecResult, error) { c, err := m.activeClient(sandboxID) if err != nil { return nil, err } return c.Exec(ctx, cmd, args, opts) } // ExecStream runs a command inside a sandbox and returns a channel of streaming events. func (m *Manager) ExecStream(ctx context.Context, sandboxID string, cmd string, args ...string) (<-chan envdclient.ExecStreamEvent, error) { c, err := m.activeClient(sandboxID) if err != nil { return nil, err } return c.ExecStream(ctx, cmd, args...) } // List returns all sandboxes. func (m *Manager) List() []models.Sandbox { m.mu.RLock() defer m.mu.RUnlock() result := make([]models.Sandbox, 0, len(m.boxes)) for _, sb := range m.boxes { result = append(result, sb.Sandbox) } return result } // Get returns a sandbox by ID. func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } return &sb.Sandbox, nil } // GetClient returns the envd client for a sandbox without bumping // LastActiveAt. Used by the proxy path which has its own activity bookkeeping. 
func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) { sb, err := m.get(sandboxID) if err != nil { return nil, err } if sb.Status != models.StatusRunning { return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status) } c := sb.client.Load() if c == nil { return nil, fmt.Errorf("%w: %s (client cleared)", ErrNotRunning, sandboxID) } return c, nil } // SetDefaults calls envd's PostInit to configure the default user and // environment variables for a running sandbox. This is called by the host // agent after sandbox creation or resume when the template specifies defaults. func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string, defaultEnv map[string]string) error { if defaultUser == "" && len(defaultEnv) == 0 { return nil } c, err := m.activeClient(sandboxID) if err != nil { return err } return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "") } // PtyAttach starts a new PTY process or reconnects to an existing one. // If cmd is non-empty, starts a new process. If empty, reconnects using tag. func (m *Manager) PtyAttach(ctx context.Context, sandboxID, tag, cmd string, args []string, cols, rows uint32, envs map[string]string, cwd string) (<-chan envdclient.PtyEvent, error) { c, err := m.activeClient(sandboxID) if err != nil { return nil, err } if cmd != "" { return c.PtyStart(ctx, tag, cmd, args, cols, rows, envs, cwd) } return c.PtyConnect(ctx, tag) } // PtySendInput sends raw bytes to a PTY process in a sandbox. func (m *Manager) PtySendInput(ctx context.Context, sandboxID, tag string, data []byte) error { c, err := m.activeClient(sandboxID) if err != nil { return err } return c.PtySendInput(ctx, tag, data) } // PtyResize updates the terminal dimensions for a PTY process in a sandbox. 
func (m *Manager) PtyResize(ctx context.Context, sandboxID, tag string, cols, rows uint32) error { c, err := m.activeClient(sandboxID) if err != nil { return err } return c.PtyResize(ctx, tag, cols, rows) } // PtyKill sends SIGKILL to a PTY process in a sandbox. func (m *Manager) PtyKill(ctx context.Context, sandboxID, tag string) error { c, err := m.activeClient(sandboxID) if err != nil { return err } return c.PtyKill(ctx, tag) } // StartBackground starts a background process inside a sandbox. func (m *Manager) StartBackground(ctx context.Context, sandboxID, tag, cmd string, args []string, envs map[string]string, cwd string) (uint32, error) { c, err := m.activeClient(sandboxID) if err != nil { return 0, err } return c.StartBackground(ctx, tag, cmd, args, envs, cwd) } // ConnectProcess re-attaches to a running process inside a sandbox. func (m *Manager) ConnectProcess(ctx context.Context, sandboxID string, pid uint32, tag string) (<-chan envdclient.ExecStreamEvent, error) { c, err := m.activeClient(sandboxID) if err != nil { return nil, err } return c.ConnectProcess(ctx, pid, tag) } // ListProcesses returns all running processes inside a sandbox. func (m *Manager) ListProcesses(ctx context.Context, sandboxID string) ([]envdclient.ProcessInfo, error) { c, err := m.activeClient(sandboxID) if err != nil { return nil, err } return c.ListProcesses(ctx) } // KillProcess sends a signal to a process inside a sandbox. func (m *Manager) KillProcess(ctx context.Context, sandboxID string, pid uint32, tag string, signal envdpb.Signal) error { c, err := m.activeClient(sandboxID) if err != nil { return err } return c.KillProcess(ctx, pid, tag, signal) } // AcquireProxyConn atomically looks up a sandbox by ID and registers an // in-flight proxy connection. Returns the sandbox's host-reachable IP, the // connection tracker, and true on success. The caller must call // tracker.Release() when the request completes. 
Returns zero values and // false if the sandbox is not found, not running, or is draining for a pause. func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok || sb.Status != models.StatusRunning { return nil, nil, false } if !sb.connTracker.Acquire() { return nil, nil, false } return sb.HostIP, sb.connTracker, true } // Ping resets the inactivity timer for a running sandbox. func (m *Manager) Ping(sandboxID string) error { m.mu.Lock() defer m.mu.Unlock() sb, ok := m.boxes[sandboxID] if !ok { return fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } if sb.Status != models.StatusRunning { return fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status) } sb.LastActiveAt = time.Now() return nil } // DrainAutoPausedIDs returns IDs that auto-paused since the last drain. // The autonomous pause paths (TTL reaper, PauseAll on shutdown / heartbeat // failure) emit per-sandbox events through eventSender directly, so this // list is currently unused. Retained for proto compatibility. func (m *Manager) DrainAutoPausedIDs() []string { return nil } func (m *Manager) get(sandboxID string) (*sandboxState, error) { m.mu.RLock() defer m.mu.RUnlock() sb, ok := m.boxes[sandboxID] if !ok { return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } return sb, nil } // StartTTLReaper starts a background goroutine that destroys sandboxes // that have exceeded their TTL (timeout_sec of inactivity). 
func (m *Manager) StartTTLReaper(ctx context.Context) { go func() { ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() for { select { case <-ctx.Done(): return case <-m.stopCh: return case <-ticker.C: m.reapExpired(ctx) } } }() } func (m *Manager) reapExpired(_ context.Context) { m.mu.RLock() var expired []string now := time.Now() for id, sb := range m.boxes { if sb.TimeoutSec <= 0 { continue } if sb.Status != models.StatusRunning { continue } // Skip sandboxes still loading memory — they're initializing. if sb.memLoadDone != nil { select { case <-sb.memLoadDone: default: continue } } if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second { expired = append(expired, id) } } m.mu.RUnlock() for _, id := range expired { slog.Info("TTL expired, auto-pausing sandbox", "id", id) pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) err := m.Pause(pauseCtx, id) cancel() if err != nil { slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err) if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil { slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr) } else if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.stopped", SandboxID: id, }) } continue } if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.auto_paused", SandboxID: id, }) } } } // Shutdown gracefully drains the manager. Running sandboxes are paused so // their state survives across agent restarts; any sandboxes still holding // runtime resources after PauseAll (e.g. paused failed, or status was // Starting/Resuming/Error) are destroyed to release VM / dm / loop / netns. // Finally the shared loop registry is fully released. func (m *Manager) Shutdown(ctx context.Context) { // Flip draining BEFORE close(stopCh) so any Create/Resume already inside // its handler-goroutine sees the flag on its next check. 
Subsequent RPC // handlers that load the flag get ErrDraining and return immediately. m.draining.Store(true) close(m.stopCh) // Cancel in-flight Create calls and wait for them to settle. A slow create // (envd readiness wait scales up to ~160s for large VMs) would otherwise // register its VM in m.boxes after the destroy loop below has run, leaking // it. After the wait each create has either rolled back or registered in // m.boxes — where PauseAll / the destroy loop pick it up. m.mu.Lock() inflight := make([]*createHandle, 0, len(m.creates)) for _, h := range m.creates { h.cancel() inflight = append(inflight, h) } m.mu.Unlock() for _, h := range inflight { <-h.done } // Snapshot every running sandbox. PauseAll calls Pause per-sandbox which // internally calls releaseRuntime → frees VM, network, dm-snapshot, and // the base-image loop refcount. slog.Info("shutdown: pausing running sandboxes") m.PauseAll(ctx) // Destroy anything still holding runtime resources. A Paused sandbox has // already had releaseRuntime called, so re-destroying it is harmless but // also unnecessary — we destroy regardless to remove it from the boxes // map and to handle states where Pause failed or wasn't applicable. m.mu.RLock() ids := make([]string, 0, len(m.boxes)) for id, sb := range m.boxes { // Paused sandboxes already had runtime freed by PauseAll. Leave the // snapshot dir on disk so the next agent instance can resume them. if sb.Status == models.StatusPaused { continue } ids = append(ids, id) } m.mu.RUnlock() for _, sbID := range ids { slog.Info("shutdown: destroying sandbox", "id", sbID) if err := m.Destroy(ctx, sbID); err != nil { slog.Warn("shutdown destroy failed", "id", sbID, "error", err) continue } // Notify CP so the DB row flips off running/pausing/error to stopped. // Async: a sync Send with CP unreachable can burn ~31s per sandbox // (3 × 10s HTTP timeout + backoff) and blow the 5min shutdown budget. 
// Best-effort — if the agent process exits before the goroutine's // HTTP request lands, HostMonitor's missing-confirmed-dead reconcile // catches it after the next agent restart (it sees the sandbox in DB // as 'running'/'missing' but not present in ListSandboxes → stopped). if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.stopped", SandboxID: sbID, }) } } m.loops.ReleaseAll() } // warnErr logs a warning if err is non-nil. Used for best-effort cleanup // in error paths where the primary error has already been captured. func warnErr(msg string, id string, err error) { if err != nil { slog.Warn(msg, "id", id, "error", err) } } // createResources tracks partially-acquired resources during sandbox creation // so they can be rolled back in reverse order on failure. type createResources struct { sandboxID string loops *devicemapper.LoopRegistry vm *vm.Manager loopImage string dmDevice *devicemapper.SnapshotDevice cowPath string slotIdx int slots *network.SlotAllocator slot *network.Slot rollCow func() // optional custom cow rollback (e.g. rename back) } func (r *createResources) rollback() { if r.vm != nil && r.sandboxID != "" { warnErr("vm destroy error", r.sandboxID, r.vm.Destroy(context.Background(), r.sandboxID)) } if r.slot != nil { warnErr("network cleanup error", r.sandboxID, network.RemoveNetwork(r.slot)) } if r.slots != nil && r.slotIdx > 0 { r.slots.Release(r.slotIdx) } if r.dmDevice != nil { warnErr("dm-snapshot remove error", r.sandboxID, devicemapper.RemoveSnapshot(context.Background(), r.dmDevice)) } if r.rollCow != nil { r.rollCow() } else if r.cowPath != "" { os.Remove(r.cowPath) } if r.loopImage != "" { r.loops.Release(r.loopImage) } } // startCrashWatcher monitors the VM process for unexpected exits. // If the process exits while the sandbox is still in m.boxes (i.e. not a // deliberate Destroy), the sandbox is cleaned up and a sandbox.error event // is pushed to the control plane. 
func (m *Manager) startCrashWatcher(sb *sandboxState) { v, ok := m.vm.Get(sb.ID) if !ok { return } go func() { select { case <-v.Exited(): case <-m.stopCh: return } // Check if this was a deliberate Destroy/Pause (sandbox already removed // from boxes, or Pause owns the cleanup). StatusPaused must also bail // because the crash watcher races with Pause flipping status to Paused // after vm.Destroy is called as part of releaseRuntime. m.mu.Lock() _, stillAlive := m.boxes[sb.ID] if stillAlive && (sb.Status == models.StatusPausing || sb.Status == models.StatusPaused) { stillAlive = false } if stillAlive { delete(m.boxes, sb.ID) } m.mu.Unlock() if !stillAlive { return } slog.Error("VM process crashed, cleaning up", "id", sb.ID) sb.lifecycleMu.Lock() m.cleanupAfterCrash(sb) sb.lifecycleMu.Unlock() if m.onDestroy != nil { m.onDestroy(sb.ID) } if m.eventSender != nil { m.eventSender.SendAsync(LifecycleEvent{ Event: "sandbox.error", SandboxID: sb.ID, }) } }() } // cleanupAfterCrash tears down sandbox resources after a VM crash. // The VM process is already dead so we skip vm.Destroy and just clean up // network, device-mapper, and loop devices. func (m *Manager) cleanupAfterCrash(sb *sandboxState) { if sb.memLoadCancel != nil { sb.memLoadCancel() if sb.memLoadDone != nil { <-sb.memLoadDone } } m.stopSampler(sb) // Remove the VM from the vm.Manager's map (process is already dead). 
_ = m.vm.Destroy(context.Background(), sb.ID) if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err) } } if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } if err := os.RemoveAll(layout.SandboxDir(m.cfg.WrennDir, sb.ID)); err != nil { slog.Warn("crash cleanup: sandbox dir error", "id", sb.ID, "error", err) } } // startSampler resolves the VMM PID and starts a background goroutine // that samples CPU/mem/disk at 1s intervals into the ring buffer. // Must be called after the sandbox is registered in m.boxes. func (m *Manager) startSampler(sb *sandboxState) { v, ok := m.vm.Get(sb.ID) if !ok { slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID) return } // v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script" // invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor) // occupies the same PID. v.PID() IS the VMM PID. vmmPID := v.PID() sb.vmmPID = vmmPID sb.ring = newMetricsRing() ctx, cancel := context.WithCancel(context.Background()) sb.samplerCancel = cancel sb.samplerDone = make(chan struct{}) // Read initial CPU counters for delta calculation. // Passed to goroutine as local state — no shared mutation. initialCPU, err := readCPUStat(vmmPID) if err != nil { slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err) } go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU) } // samplerLoop samples metrics at 1s intervals. // lastCPU is goroutine-local to avoid shared-state races. 
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) { defer close(sb.samplerDone) ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux lastTime := time.Now() cpuInitialized := lastCPU != (cpuStat{}) for { select { case <-ctx.Done(): return case now := <-ticker.C: elapsed := now.Sub(lastTime).Seconds() lastTime = now // CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100 var cpuPct float64 cur, err := readCPUStat(vmmPID) if err == nil { if cpuInitialized && elapsed > 0 && vcpus > 0 { deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime)) cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0 if cpuPct > 100.0 { cpuPct = 100.0 } if cpuPct < 0 { cpuPct = 0 } } lastCPU = cur cpuInitialized = true } // Memory & disk: guest-reported metrics from envd /metrics. // Using the guest's own view for both is accurate and avoids // host-side CoW file quirks (sparse allocation, silent errors). var memBytes, diskBytes int64 if m, err := readEnvdMetrics(ctx, sb.client.Load()); err == nil { memBytes = m.MemBytes diskBytes = m.DiskBytes } sb.ring.Push(MetricPoint{ Timestamp: now, CPUPct: cpuPct, MemBytes: memBytes, DiskBytes: diskBytes, }) } } } // stopSampler stops the metrics sampling goroutine and waits for it to exit. func (m *Manager) stopSampler(sb *sandboxState) { if sb.samplerCancel != nil { sb.samplerCancel() <-sb.samplerDone sb.samplerCancel = nil } } // GetMetrics returns the ring buffer data for the given range tier. // Valid ranges: "10m", "2h", "24h". func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok { return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } if sb.ring == nil { return nil, nil } // Map the requested range to the appropriate ring tier and time cutoff. 
var points []MetricPoint var cutoff time.Duration switch rangeTier { case "5m": points = sb.ring.Get10m() cutoff = 5 * time.Minute case "10m": points = sb.ring.Get10m() cutoff = 10 * time.Minute case "1h": points = sb.ring.Get2h() cutoff = 1 * time.Hour case "2h": points = sb.ring.Get2h() cutoff = 2 * time.Hour case "6h": points = sb.ring.Get24h() cutoff = 6 * time.Hour case "12h": points = sb.ring.Get24h() cutoff = 12 * time.Hour case "24h": points = sb.ring.Get24h() cutoff = 24 * time.Hour default: return nil, fmt.Errorf("%w: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", ErrInvalidRange, rangeTier) } // Filter points to the requested time window. threshold := time.Now().Add(-cutoff) filtered := points[:0:0] for _, p := range points { if !p.Timestamp.Before(threshold) { filtered = append(filtered, p) } } return filtered, nil } // FlushMetrics returns all three tier ring buffers, clears the ring, and // stops the sampler goroutine. Called by the control plane before pause/destroy. func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) { m.mu.RLock() sb, ok := m.boxes[sandboxID] m.mu.RUnlock() if !ok { return nil, nil, nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID) } m.stopSampler(sb) if sb.ring == nil { return nil, nil, nil, nil } pts10m, pts2h, pts24h = sb.ring.Flush() return pts10m, pts2h, pts24h, nil }