forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
1249 lines
40 KiB
Go
1249 lines
40 KiB
Go
package sandbox
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"log/slog"
|
||
"net"
|
||
"os"
|
||
"path/filepath"
|
||
"sync"
|
||
"sync/atomic"
|
||
"syscall"
|
||
"time"
|
||
|
||
"github.com/jackc/pgx/v5/pgtype"
|
||
|
||
"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
|
||
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
|
||
"git.omukk.dev/wrenn/wrenn/internal/layout"
|
||
"git.omukk.dev/wrenn/wrenn/internal/models"
|
||
"git.omukk.dev/wrenn/wrenn/internal/network"
|
||
"git.omukk.dev/wrenn/wrenn/internal/vm"
|
||
"git.omukk.dev/wrenn/wrenn/pkg/id"
|
||
envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen"
|
||
)
|
||
|
||
// Sentinel errors exposed by the sandbox manager. Several call sites wrap
// them with extra context via %w, so match with errors.Is instead of
// comparing message text.
var (
	// ErrNotFound reports that the requested sandbox ID has no entry in the
	// manager's in-memory map.
	ErrNotFound = errors.New("sandbox not found")

	// ErrNotRunning reports that the operation needs a sandbox in
	// StatusRunning, but the sandbox is in a different state or its envd
	// client was cleared by a concurrent pause.
	ErrNotRunning = errors.New("sandbox not running")

	// ErrNotPaused reports that the operation needs a sandbox in StatusPaused
	// but the sandbox is in a different state.
	ErrNotPaused = errors.New("sandbox not paused")

	// ErrInvalidRange reports an invalid metrics range parameter.
	ErrInvalidRange = errors.New("invalid range")
)
|
||
|
||
// MinTimeoutSec is the smallest positive inactivity TTL, in seconds, that
// Create/Resume accept. A TTL of 0 still means "no TTL"; any positive value
// below this floor is raised to it.
//
// Rationale: very short TTLs race the post-create/post-resume startup window
// (m.boxes insertion → /init → startMemoryLoader). While memLoadDone is
// briefly unset the reaper guard does not fire, so a sub-second TimeoutSec
// could auto-pause a sandbox before its memory loader arms and produce a
// stale ch.snapshot. 60s is well above the startup envelope.
const MinTimeoutSec = 60

// clampTimeout normalises a caller-supplied TTL in seconds: zero and negative
// inputs yield 0 ("no TTL"), positive inputs below MinTimeoutSec yield
// MinTimeoutSec, and everything else is returned unchanged.
func clampTimeout(timeoutSec int) int {
	switch {
	case timeoutSec <= 0:
		return 0
	case timeoutSec < MinTimeoutSec:
		return MinTimeoutSec
	default:
		return timeoutSec
	}
}
|
||
|
||
const (
	// envdReadyTimeoutFloor is the shortest time we are willing to wait for
	// envd's /health to answer after a fresh boot or restore.
	envdReadyTimeoutFloor = 120 * time.Second

	// envdReadyTimeoutPerGB is the extra wait budget granted per GiB of guest
	// RAM: larger VMs take longer to cold-boot (struct-page init, multi-vCPU
	// bringup, cold dm-snapshot I/O).
	envdReadyTimeoutPerGB = 8 * time.Second
)

// envdReadyTimeout returns the WaitUntilReady deadline for a VM with memoryMB
// of RAM. The memory size is rounded up to whole GiB and multiplied by
// envdReadyTimeoutPerGB; the result never drops below envdReadyTimeoutFloor.
// A 20 GiB guest gets 160s.
func envdReadyTimeout(memoryMB int) time.Duration {
	gib := (memoryMB + 1023) / 1024 // ceil(memoryMB / 1024)
	return max(time.Duration(gib)*envdReadyTimeoutPerGB, envdReadyTimeoutFloor)
}
|
||
|
||
// Config carries the filesystem locations, timeouts and version strings the
// sandbox manager is constructed with.
type Config struct {
	// WrennDir is the root directory (e.g. /var/lib/wrenn); all sub-paths are
	// derived from it via the layout package.
	WrennDir string
	// EnvdTimeout bounds individual envd calls such as the post-create /init.
	// New substitutes 30s when it is left at zero.
	EnvdTimeout time.Duration
	// DefaultRootfsSizeMB is the target size for template rootfs images;
	// 0 → DefaultDiskSizeMB.
	DefaultRootfsSizeMB int

	// The fields below are resolved at startup by the host agent.

	// KernelPath is the path to the latest vmlinux-x.y.z.
	KernelPath string
	// KernelVersion is the semver extracted from the kernel filename.
	KernelVersion string
	// VMMBin is the path to the cloud-hypervisor binary.
	VMMBin string
	// VMMVersion is the semver reported by cloud-hypervisor --version.
	VMMVersion string
	// AgentVersion is the host agent version (injected via ldflags).
	AgentVersion string
}
|
||
|
||
// LifecycleEvent is a notification about a state change the agent performed
// on its own initiative, rather than in response to a request.
type LifecycleEvent struct {
	// Event names what happened, e.g. "sandbox.auto_paused" or
	// "sandbox.stopped".
	Event string
	// SandboxID identifies the sandbox the event refers to.
	SandboxID string
}

// EventSender delivers autonomous lifecycle events to the control plane.
//
// SendAsync is fire-and-forget. Send blocks with retries and returns the
// final error, so callers running under a shutdown deadline can guarantee
// delivery before the process exits.
type EventSender interface {
	SendAsync(event LifecycleEvent)
	Send(ctx context.Context, event LifecycleEvent) error
}

// ErrDraining is what Create / Resume return once the manager has begun
// shutting down. At that point the agent process is about to pause every
// running sandbox and exit; admitting new lifecycle work would race the
// destroy loop and leave orphaned VMs behind after the process is gone.
var ErrDraining = errors.New("agent is draining for shutdown")
|
||
|
||
// Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd.
|
||
type Manager struct {
|
||
cfg Config
|
||
vm *vm.Manager
|
||
slots *network.SlotAllocator
|
||
loops *devicemapper.LoopRegistry
|
||
mu sync.RWMutex
|
||
boxes map[string]*sandboxState
|
||
stopCh chan struct{}
|
||
// draining is set at the start of Shutdown. Create and Resume check it
|
||
// (atomically, no lock needed) and refuse new work so the destroy loop
|
||
// can run to completion without racing fresh RPCs.
|
||
draining atomic.Bool
|
||
|
||
// creates tracks in-flight Create calls by sandbox ID. An entry exists
|
||
// only while Create is acquiring resources / booting the VM, before the
|
||
// sandbox lands in boxes. Destroy consults it to abort a create that
|
||
// would otherwise leak its half-built VM. Guarded by mu.
|
||
creates map[string]*createHandle
|
||
|
||
// onDestroy is called with the sandbox ID after cleanup completes.
|
||
// Used by ProxyHandler to evict cached reverse proxies.
|
||
onDestroy func(sandboxID string)
|
||
|
||
// eventSender sends autonomous lifecycle events (auto-pause, auto-destroy)
|
||
// to the CP via HTTP callback. Optional — nil means events are only
|
||
// propagated through the HostMonitor reconciler.
|
||
eventSender EventSender
|
||
}
|
||
|
||
// SetOnDestroy registers a callback invoked after each sandbox is cleaned up.
|
||
func (m *Manager) SetOnDestroy(fn func(sandboxID string)) {
|
||
m.onDestroy = fn
|
||
}
|
||
|
||
// SetEventSender registers the callback sender for autonomous lifecycle events.
|
||
func (m *Manager) SetEventSender(sender EventSender) {
|
||
m.eventSender = sender
|
||
}
|
||
|
||
// sandboxState holds the runtime state for a single sandbox.
|
||
type sandboxState struct {
|
||
models.Sandbox
|
||
lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox
|
||
slot *network.Slot
|
||
// client is published via atomic.Pointer so Exec/Pty/Process callers can
|
||
// load it without holding lifecycleMu. Pause's releaseRuntime stores nil;
|
||
// Resume stores a fresh client. Callers MUST nil-check after Load.
|
||
client atomic.Pointer[envdclient.Client]
|
||
connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain
|
||
dmDevice *devicemapper.SnapshotDevice
|
||
baseImagePath string // path to the base template rootfs (for loop registry release)
|
||
|
||
// sandboxDirOverride, when non-empty, pins this sandbox's VMConfig.SandboxDir
|
||
// to a path other than the default vm.SandboxTmpDir(sb.ID). Set when the
|
||
// sandbox was launched from a snapshot template — CH's saved config.json
|
||
// hardcodes the *original* source sandbox's tmpfs path, so every subsequent
|
||
// restore (Resume, PauseAll/restart) must reuse that same path or CH cannot
|
||
// find rootfs.ext4 in the new mount namespace.
|
||
sandboxDirOverride string
|
||
|
||
// Background memory loading state (set during Resume for UFFD sandboxes).
|
||
// nil for freshly-created sandboxes. For resumed sandboxes, memLoadDone
|
||
// is closed when the background loader finishes (success or failure).
|
||
memLoadDone chan struct{} // closed when background memory loader exits
|
||
memLoadCancel context.CancelFunc // cancels the background loader goroutine
|
||
|
||
// Metrics sampling state.
|
||
vmmPID int // VMM process PID (child of unshare wrapper)
|
||
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
|
||
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
|
||
samplerDone chan struct{} // closed when the sampling goroutine exits
|
||
}
|
||
|
||
// buildMetadata constructs the metadata map with version information.
|
||
func (m *Manager) buildMetadata(envdVersion string) map[string]string {
|
||
meta := map[string]string{
|
||
"kernel_version": m.cfg.KernelVersion,
|
||
"vmm_version": m.cfg.VMMVersion,
|
||
"agent_version": m.cfg.AgentVersion,
|
||
}
|
||
if envdVersion != "" {
|
||
meta["envd_version"] = envdVersion
|
||
}
|
||
return meta
|
||
}
|
||
|
||
// createHandle lets a concurrent Destroy coordinate with a Create that is
// still in flight.
type createHandle struct {
	// cancel aborts the creation context.
	cancel context.CancelFunc
	// done is closed once Create has fully finished — whether it succeeded or
	// rolled back a partial failure.
	done chan struct{}
}
|
||
|
||
// New creates a new sandbox manager.
|
||
func New(cfg Config) *Manager {
|
||
if cfg.EnvdTimeout == 0 {
|
||
cfg.EnvdTimeout = 30 * time.Second
|
||
}
|
||
return &Manager{
|
||
cfg: cfg,
|
||
vm: vm.NewManager(),
|
||
slots: network.NewSlotAllocator(),
|
||
loops: devicemapper.NewLoopRegistry(),
|
||
boxes: make(map[string]*sandboxState),
|
||
creates: make(map[string]*createHandle),
|
||
stopCh: make(chan struct{}),
|
||
}
|
||
}
|
||
|
||
// TemplateRootfsSize returns the actual disk usage of a template's rootfs
|
||
// file on this host. Uses block-level accounting (stat.Blocks * 512) so
|
||
// sparse files (even after EnsureImageSizes expansion) report only the
|
||
// blocks that are actually allocated on disk.
|
||
func (m *Manager) TemplateRootfsSize(teamID, templateID pgtype.UUID) (int64, error) {
|
||
path := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
|
||
info, err := os.Stat(path)
|
||
if err != nil {
|
||
return 0, fmt.Errorf("stat template rootfs: %w", err)
|
||
}
|
||
if sys, ok := info.Sys().(*syscall.Stat_t); ok {
|
||
return sys.Blocks * 512, nil
|
||
}
|
||
return info.Size(), nil
|
||
}
|
||
|
||
// Create boots a new sandbox. If the template's TemplateDir contains a CH
|
||
// memory snapshot (state.json + config.json) it is restored via CH's
|
||
// --restore + UFFD lazy memory; otherwise a fresh boot from the flattened
|
||
// rootfs is performed. defaultUser/defaultEnv are forwarded to envd's /init
|
||
// in both paths.
|
||
//
|
||
// If sandboxID is empty, a new ID is generated.
|
||
func (m *Manager) Create(
|
||
ctx context.Context,
|
||
sandboxID string,
|
||
teamID, templateID pgtype.UUID,
|
||
vcpus, memoryMB, timeoutSec, diskSizeMB int,
|
||
defaultUser string,
|
||
defaultEnv map[string]string,
|
||
) (*models.Sandbox, int64, error) {
|
||
if m.draining.Load() {
|
||
return nil, 0, ErrDraining
|
||
}
|
||
if sandboxID == "" {
|
||
sandboxID = id.FormatSandboxID(id.NewSandboxID())
|
||
}
|
||
|
||
if vcpus <= 0 {
|
||
vcpus = 1
|
||
}
|
||
if memoryMB <= 0 {
|
||
memoryMB = 512
|
||
}
|
||
if diskSizeMB <= 0 {
|
||
diskSizeMB = m.cfg.DefaultRootfsSizeMB
|
||
}
|
||
timeoutSec = clampTimeout(timeoutSec)
|
||
|
||
// Register an in-flight create handle before acquiring any resources so a
|
||
// concurrent Destroy can abort this creation and wait for its rollback.
|
||
// Without this, a Destroy that arrives while the VM is still booting finds
|
||
// nothing in m.boxes, no-ops, and Create races on to register a VM that no
|
||
// caller owns — a permanent VM / dm / network / loop leak.
|
||
createCtx, cancelCreate := context.WithCancel(ctx)
|
||
handle := &createHandle{cancel: cancelCreate, done: make(chan struct{})}
|
||
m.mu.Lock()
|
||
if _, exists := m.boxes[sandboxID]; exists {
|
||
m.mu.Unlock()
|
||
cancelCreate()
|
||
return nil, 0, fmt.Errorf("sandbox %s already exists", sandboxID)
|
||
}
|
||
if _, inflight := m.creates[sandboxID]; inflight {
|
||
m.mu.Unlock()
|
||
cancelCreate()
|
||
return nil, 0, fmt.Errorf("sandbox %s create already in progress", sandboxID)
|
||
}
|
||
m.creates[sandboxID] = handle
|
||
m.mu.Unlock()
|
||
defer func() {
|
||
m.mu.Lock()
|
||
delete(m.creates, sandboxID)
|
||
m.mu.Unlock()
|
||
cancelCreate()
|
||
close(handle.done)
|
||
}()
|
||
// All subsequent steps run under the cancellable create context so a
|
||
// concurrent Destroy can interrupt a slow VM boot / envd readiness wait.
|
||
ctx = createCtx
|
||
|
||
// Snapshot template? Route to the CH-restore path; the launcher manages
|
||
// its own resource lifecycle and registers the sandbox itself.
|
||
//
|
||
// System base templates never carry a memory snapshot; guarding here
|
||
// prevents a stray state.json (e.g. from a failed CreateSnapshot that
|
||
// mis-targeted a base template) from silently rerouting fresh boots into
|
||
// the restore path with a confusing error downstream.
|
||
templateDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
|
||
if !layout.IsSystemTemplate(teamID, templateID) && layout.IsSnapshotTemplate(templateDir) {
|
||
return m.createFromSnapshotTemplate(ctx, sandboxID, teamID, templateID,
|
||
vcpus, memoryMB, timeoutSec, diskSizeMB, defaultUser, defaultEnv)
|
||
}
|
||
|
||
// Resolve base rootfs image.
|
||
baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
|
||
if _, err := os.Stat(baseRootfs); err != nil {
|
||
return nil, 0, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
|
||
}
|
||
|
||
// Acquire shared read-only loop device for the base image.
|
||
originLoop, err := m.loops.Acquire(baseRootfs)
|
||
if err != nil {
|
||
return nil, 0, fmt.Errorf("acquire loop device: %w", err)
|
||
}
|
||
|
||
originSize, err := devicemapper.OriginSizeBytes(originLoop)
|
||
if err != nil {
|
||
m.loops.Release(baseRootfs)
|
||
return nil, 0, fmt.Errorf("get origin size: %w", err)
|
||
}
|
||
|
||
// Create dm-snapshot with per-sandbox CoW file.
|
||
// CoW must be at least as large as the origin — if every block is
|
||
// rewritten, the CoW stores a full copy. Undersized CoW causes
|
||
// dm-snapshot invalidation → EIO on all guest I/O.
|
||
dmName := "wrenn-" + sandboxID
|
||
if err := os.MkdirAll(layout.SandboxDir(m.cfg.WrennDir, sandboxID), 0o755); err != nil {
|
||
m.loops.Release(baseRootfs)
|
||
return nil, 0, fmt.Errorf("create sandbox dir: %w", err)
|
||
}
|
||
cowPath := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID)
|
||
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
|
||
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
||
if err != nil {
|
||
m.loops.Release(baseRootfs)
|
||
return nil, 0, fmt.Errorf("create dm-snapshot: %w", err)
|
||
}
|
||
|
||
res := &createResources{
|
||
sandboxID: sandboxID,
|
||
loops: m.loops,
|
||
loopImage: baseRootfs,
|
||
dmDevice: dmDev,
|
||
cowPath: cowPath,
|
||
slots: m.slots,
|
||
}
|
||
|
||
// Allocate network slot.
|
||
slotIdx, err := m.slots.Allocate()
|
||
if err != nil {
|
||
res.rollback()
|
||
return nil, 0, fmt.Errorf("allocate network slot: %w", err)
|
||
}
|
||
res.slotIdx = slotIdx
|
||
slot := network.NewSlot(slotIdx)
|
||
|
||
// Set up network.
|
||
if err := network.CreateNetwork(slot); err != nil {
|
||
res.rollback()
|
||
return nil, 0, fmt.Errorf("create network: %w", err)
|
||
}
|
||
res.slot = slot
|
||
|
||
// Boot VM — CH gets the dm device path.
|
||
vmCfg := vm.VMConfig{
|
||
SandboxID: sandboxID,
|
||
TemplateID: id.UUIDString(templateID),
|
||
KernelPath: m.cfg.KernelPath,
|
||
RootfsPath: dmDev.DevicePath,
|
||
VCPUs: vcpus,
|
||
MemoryMB: memoryMB,
|
||
NetworkNamespace: slot.NamespaceID,
|
||
TapDevice: slot.TapName,
|
||
TapMAC: slot.TapMAC,
|
||
GuestIP: slot.GuestIP,
|
||
GatewayIP: slot.TapIP,
|
||
NetMask: slot.GuestNetMask,
|
||
VMMBin: m.cfg.VMMBin,
|
||
LogDir: filepath.Join(m.cfg.WrennDir, "logs"),
|
||
}
|
||
|
||
if _, err := m.vm.Create(ctx, vmCfg); err != nil {
|
||
res.rollback()
|
||
return nil, 0, fmt.Errorf("create VM: %w", err)
|
||
}
|
||
res.vm = m.vm
|
||
|
||
// Wait for envd to be ready. The budget scales with guest RAM — a large
|
||
// VM cold-boots slower than the minimal default.
|
||
client := envdclient.New(slot.HostIP.String())
|
||
waitCtx, waitCancel := context.WithTimeout(ctx, envdReadyTimeout(memoryMB))
|
||
defer waitCancel()
|
||
|
||
if err := client.WaitUntilReady(waitCtx); err != nil {
|
||
res.rollback()
|
||
return nil, 0, fmt.Errorf("wait for envd: %w", err)
|
||
}
|
||
|
||
// Fetch envd version (best-effort).
|
||
envdVersion, _ := client.FetchVersion(ctx)
|
||
|
||
// Apply template defaults via envd /init (no-op when both empty).
|
||
if defaultUser != "" || len(defaultEnv) > 0 {
|
||
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
||
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil {
|
||
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
|
||
}
|
||
initCancel()
|
||
}
|
||
|
||
now := time.Now()
|
||
sb := &sandboxState{
|
||
Sandbox: models.Sandbox{
|
||
ID: sandboxID,
|
||
Status: models.StatusRunning,
|
||
TemplateTeamID: teamID.Bytes,
|
||
TemplateID: templateID.Bytes,
|
||
VCPUs: vcpus,
|
||
MemoryMB: memoryMB,
|
||
TimeoutSec: timeoutSec,
|
||
SlotIndex: slotIdx,
|
||
HostIP: slot.HostIP,
|
||
RootfsPath: dmDev.DevicePath,
|
||
CreatedAt: now,
|
||
LastActiveAt: now,
|
||
Metadata: m.buildMetadata(envdVersion),
|
||
},
|
||
slot: slot,
|
||
connTracker: &ConnTracker{},
|
||
dmDevice: dmDev,
|
||
baseImagePath: baseRootfs,
|
||
}
|
||
sb.client.Store(client)
|
||
|
||
m.mu.Lock()
|
||
m.boxes[sandboxID] = sb
|
||
m.mu.Unlock()
|
||
|
||
m.startSampler(sb)
|
||
m.startCrashWatcher(sb)
|
||
|
||
slog.Info("sandbox created",
|
||
"id", sandboxID,
|
||
"team_id", teamID,
|
||
"template_id", templateID,
|
||
"host_ip", slot.HostIP.String(),
|
||
"dm_device", dmDev.DevicePath,
|
||
)
|
||
|
||
return &sb.Sandbox, cowSize, nil
|
||
}
|
||
|
||
// Destroy stops and cleans up a sandbox. If the sandbox is running, its VM,
|
||
// network, and rootfs are torn down. Any pause snapshot files are also removed.
|
||
func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||
m.mu.Lock()
|
||
if handle, inflight := m.creates[sandboxID]; inflight {
|
||
// A create is still in flight. Cancel it and wait for its rollback to
|
||
// finish, otherwise the half-built VM / dm-snapshot / network / loop
|
||
// device it acquired would leak with no owner. If the create instead
|
||
// raced to success, it will have registered the sandbox in m.boxes by
|
||
// the time done is closed — the normal teardown below then runs.
|
||
m.mu.Unlock()
|
||
slog.Info("destroy: aborting in-flight sandbox create", "id", sandboxID)
|
||
handle.cancel()
|
||
<-handle.done
|
||
m.mu.Lock()
|
||
}
|
||
sb, ok := m.boxes[sandboxID]
|
||
// statusAtEntry distinguishes "user is destroying an already-paused
|
||
// sandbox" (legitimate cleanup → fall through) from "user is destroying
|
||
// a running sandbox that raced to Paused before we got lifecycleMu"
|
||
// (preserve snapshot → re-insert and bail). Captured under m.mu so it
|
||
// reflects the same generation as the boxes-map delete.
|
||
var statusAtEntry models.SandboxStatus
|
||
if ok {
|
||
statusAtEntry = sb.Status
|
||
delete(m.boxes, sandboxID)
|
||
}
|
||
m.mu.Unlock()
|
||
|
||
if ok {
|
||
// Wait for any in-progress Pause to finish before tearing down resources.
|
||
sb.lifecycleMu.Lock()
|
||
defer sb.lifecycleMu.Unlock()
|
||
|
||
// Racing-Pause guard. Only fires when the sandbox was NOT paused at
|
||
// entry but became paused while we waited for lifecycleMu — i.e. a
|
||
// concurrent Pause completed under us. In that case the snapshot was
|
||
// just written to disk and destroying now would wipe a freshly-paused
|
||
// sandbox. Re-insert into m.boxes (releaseRuntime already cleared
|
||
// runtime refs; slot reservation retained for Resume) and return nil
|
||
// so the agent's view stays consistent with the on-disk state.
|
||
//
|
||
// A legitimate Destroy of an already-paused sandbox (statusAtEntry ==
|
||
// Paused) falls through to cleanup, which releases the slot and
|
||
// removes the snapshot dir — the user explicitly asked for deletion.
|
||
if statusAtEntry != models.StatusPaused && sb.Status == models.StatusPaused {
|
||
m.mu.Lock()
|
||
m.boxes[sandboxID] = sb
|
||
m.mu.Unlock()
|
||
slog.Info("destroy: racing pause completed, preserving snapshot", "id", sandboxID)
|
||
return nil
|
||
}
|
||
m.cleanup(ctx, sb)
|
||
}
|
||
|
||
// Always clean up pause snapshot files (may exist if sandbox was paused).
|
||
if err := os.RemoveAll(layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)); err != nil {
|
||
slog.Warn("snapshot cleanup error", "id", sandboxID, "error", err)
|
||
}
|
||
|
||
if m.onDestroy != nil {
|
||
m.onDestroy(sandboxID)
|
||
}
|
||
|
||
slog.Info("sandbox destroyed", "id", sandboxID)
|
||
return nil
|
||
}
|
||
|
||
// cleanup tears down all resources for a sandbox.
|
||
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
|
||
if sb.memLoadCancel != nil {
|
||
sb.memLoadCancel()
|
||
if sb.memLoadDone != nil {
|
||
<-sb.memLoadDone
|
||
}
|
||
}
|
||
m.stopSampler(sb)
|
||
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
||
slog.Warn("vm destroy error", "id", sb.ID, "error", err)
|
||
}
|
||
if err := network.RemoveNetwork(sb.slot); err != nil {
|
||
slog.Warn("network cleanup error", "id", sb.ID, "error", err)
|
||
}
|
||
m.slots.Release(sb.SlotIndex)
|
||
|
||
// Tear down dm-snapshot and release the base image loop device.
|
||
if sb.dmDevice != nil {
|
||
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
||
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
|
||
}
|
||
os.Remove(sb.dmDevice.CowPath)
|
||
}
|
||
// Paused branch: dm-snapshot and loop were already released by
|
||
// releaseRuntime; the CoW file inside the sandbox dir is removed by
|
||
// Destroy's os.RemoveAll(SandboxDir) below.
|
||
if sb.baseImagePath != "" {
|
||
m.loops.Release(sb.baseImagePath)
|
||
}
|
||
}
|
||
|
||
// Pause, Resume, CreateSnapshot, FlattenRootfs, DeleteSnapshot, PauseAll
|
||
// are implemented in pause.go.
|
||
|
||
// activeClient resolves sandboxID to its envd client when the sandbox is in
|
||
// StatusRunning and the client has not been cleared by a concurrent pause.
|
||
// It bumps LastActiveAt as a side effect. Returns ErrNotFound if missing or
|
||
// ErrNotRunning (wrapped with context) otherwise.
|
||
//
|
||
// All Exec/Pty/Process methods funnel through this — it is the single
|
||
// chokepoint that guarantees we never deref a stale sb.client.
|
||
func (m *Manager) activeClient(sandboxID string) (*envdclient.Client, error) {
|
||
sb, err := m.get(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if sb.Status != models.StatusRunning {
|
||
return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
|
||
}
|
||
c := sb.client.Load()
|
||
if c == nil {
|
||
// Race: status flipped from Running between m.get and Load (pause's
|
||
// releaseRuntime cleared the pointer).
|
||
return nil, fmt.Errorf("%w: %s (client cleared)", ErrNotRunning, sandboxID)
|
||
}
|
||
m.mu.Lock()
|
||
sb.LastActiveAt = time.Now()
|
||
m.mu.Unlock()
|
||
return c, nil
|
||
}
|
||
|
||
// Exec runs a command inside a sandbox.
|
||
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args []string, opts *envdclient.ExecOpts) (*envdclient.ExecResult, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return c.Exec(ctx, cmd, args, opts)
|
||
}
|
||
|
||
// ExecStream runs a command inside a sandbox and returns a channel of streaming events.
|
||
func (m *Manager) ExecStream(ctx context.Context, sandboxID string, cmd string, args ...string) (<-chan envdclient.ExecStreamEvent, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return c.ExecStream(ctx, cmd, args...)
|
||
}
|
||
|
||
// List returns all sandboxes.
|
||
func (m *Manager) List() []models.Sandbox {
|
||
m.mu.RLock()
|
||
defer m.mu.RUnlock()
|
||
|
||
result := make([]models.Sandbox, 0, len(m.boxes))
|
||
for _, sb := range m.boxes {
|
||
result = append(result, sb.Sandbox)
|
||
}
|
||
return result
|
||
}
|
||
|
||
// Get returns a sandbox by ID.
|
||
func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) {
|
||
sb, err := m.get(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &sb.Sandbox, nil
|
||
}
|
||
|
||
// GetClient returns the envd client for a sandbox without bumping
|
||
// LastActiveAt. Used by the proxy path which has its own activity bookkeeping.
|
||
func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
|
||
sb, err := m.get(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if sb.Status != models.StatusRunning {
|
||
return nil, fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
|
||
}
|
||
c := sb.client.Load()
|
||
if c == nil {
|
||
return nil, fmt.Errorf("%w: %s (client cleared)", ErrNotRunning, sandboxID)
|
||
}
|
||
return c, nil
|
||
}
|
||
|
||
// SetDefaults calls envd's PostInit to configure the default user and
|
||
// environment variables for a running sandbox. This is called by the host
|
||
// agent after sandbox creation or resume when the template specifies defaults.
|
||
func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string, defaultEnv map[string]string) error {
|
||
if defaultUser == "" && len(defaultEnv) == 0 {
|
||
return nil
|
||
}
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
|
||
}
|
||
|
||
// PtyAttach starts a new PTY process or reconnects to an existing one.
|
||
// If cmd is non-empty, starts a new process. If empty, reconnects using tag.
|
||
func (m *Manager) PtyAttach(ctx context.Context, sandboxID, tag, cmd string, args []string, cols, rows uint32, envs map[string]string, cwd string) (<-chan envdclient.PtyEvent, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if cmd != "" {
|
||
return c.PtyStart(ctx, tag, cmd, args, cols, rows, envs, cwd)
|
||
}
|
||
return c.PtyConnect(ctx, tag)
|
||
}
|
||
|
||
// PtySendInput sends raw bytes to a PTY process in a sandbox.
|
||
func (m *Manager) PtySendInput(ctx context.Context, sandboxID, tag string, data []byte) error {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return c.PtySendInput(ctx, tag, data)
|
||
}
|
||
|
||
// PtyResize updates the terminal dimensions for a PTY process in a sandbox.
|
||
func (m *Manager) PtyResize(ctx context.Context, sandboxID, tag string, cols, rows uint32) error {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return c.PtyResize(ctx, tag, cols, rows)
|
||
}
|
||
|
||
// PtyKill sends SIGKILL to a PTY process in a sandbox.
|
||
func (m *Manager) PtyKill(ctx context.Context, sandboxID, tag string) error {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return c.PtyKill(ctx, tag)
|
||
}
|
||
|
||
// StartBackground starts a background process inside a sandbox.
|
||
func (m *Manager) StartBackground(ctx context.Context, sandboxID, tag, cmd string, args []string, envs map[string]string, cwd string) (uint32, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return 0, err
|
||
}
|
||
return c.StartBackground(ctx, tag, cmd, args, envs, cwd)
|
||
}
|
||
|
||
// ConnectProcess re-attaches to a running process inside a sandbox.
|
||
func (m *Manager) ConnectProcess(ctx context.Context, sandboxID string, pid uint32, tag string) (<-chan envdclient.ExecStreamEvent, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return c.ConnectProcess(ctx, pid, tag)
|
||
}
|
||
|
||
// ListProcesses returns all running processes inside a sandbox.
|
||
func (m *Manager) ListProcesses(ctx context.Context, sandboxID string) ([]envdclient.ProcessInfo, error) {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return c.ListProcesses(ctx)
|
||
}
|
||
|
||
// KillProcess sends a signal to a process inside a sandbox.
|
||
func (m *Manager) KillProcess(ctx context.Context, sandboxID string, pid uint32, tag string, signal envdpb.Signal) error {
|
||
c, err := m.activeClient(sandboxID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return c.KillProcess(ctx, pid, tag, signal)
|
||
}
|
||
|
||
// AcquireProxyConn atomically looks up a sandbox by ID and registers an
|
||
// in-flight proxy connection. Returns the sandbox's host-reachable IP, the
|
||
// connection tracker, and true on success. The caller must call
|
||
// tracker.Release() when the request completes. Returns zero values and
|
||
// false if the sandbox is not found, not running, or is draining for a pause.
|
||
func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) {
|
||
m.mu.RLock()
|
||
sb, ok := m.boxes[sandboxID]
|
||
m.mu.RUnlock()
|
||
|
||
if !ok || sb.Status != models.StatusRunning {
|
||
return nil, nil, false
|
||
}
|
||
if !sb.connTracker.Acquire() {
|
||
return nil, nil, false
|
||
}
|
||
return sb.HostIP, sb.connTracker, true
|
||
}
|
||
|
||
// Ping resets the inactivity timer for a running sandbox.
|
||
func (m *Manager) Ping(sandboxID string) error {
|
||
m.mu.Lock()
|
||
defer m.mu.Unlock()
|
||
|
||
sb, ok := m.boxes[sandboxID]
|
||
if !ok {
|
||
return fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
||
}
|
||
if sb.Status != models.StatusRunning {
|
||
return fmt.Errorf("%w: %s (status: %s)", ErrNotRunning, sandboxID, sb.Status)
|
||
}
|
||
sb.LastActiveAt = time.Now()
|
||
return nil
|
||
}
|
||
|
||
// DrainAutoPausedIDs returns IDs that auto-paused since the last drain.
|
||
// The autonomous pause paths (TTL reaper, PauseAll on shutdown / heartbeat
|
||
// failure) emit per-sandbox events through eventSender directly, so this
|
||
// list is currently unused. Retained for proto compatibility.
|
||
func (m *Manager) DrainAutoPausedIDs() []string {
|
||
return nil
|
||
}
|
||
|
||
func (m *Manager) get(sandboxID string) (*sandboxState, error) {
|
||
m.mu.RLock()
|
||
defer m.mu.RUnlock()
|
||
|
||
sb, ok := m.boxes[sandboxID]
|
||
if !ok {
|
||
return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
||
}
|
||
return sb, nil
|
||
}
|
||
|
||
// StartTTLReaper starts a background goroutine that destroys sandboxes
|
||
// that have exceeded their TTL (timeout_sec of inactivity).
|
||
func (m *Manager) StartTTLReaper(ctx context.Context) {
|
||
go func() {
|
||
ticker := time.NewTicker(2 * time.Second)
|
||
defer ticker.Stop()
|
||
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case <-m.stopCh:
|
||
return
|
||
case <-ticker.C:
|
||
m.reapExpired(ctx)
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
|
||
func (m *Manager) reapExpired(_ context.Context) {
|
||
m.mu.RLock()
|
||
var expired []string
|
||
now := time.Now()
|
||
for id, sb := range m.boxes {
|
||
if sb.TimeoutSec <= 0 {
|
||
continue
|
||
}
|
||
if sb.Status != models.StatusRunning {
|
||
continue
|
||
}
|
||
// Skip sandboxes still loading memory — they're initializing.
|
||
if sb.memLoadDone != nil {
|
||
select {
|
||
case <-sb.memLoadDone:
|
||
default:
|
||
continue
|
||
}
|
||
}
|
||
if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
|
||
expired = append(expired, id)
|
||
}
|
||
}
|
||
m.mu.RUnlock()
|
||
|
||
for _, id := range expired {
|
||
slog.Info("TTL expired, auto-pausing sandbox", "id", id)
|
||
pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||
err := m.Pause(pauseCtx, id)
|
||
cancel()
|
||
if err != nil {
|
||
slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err)
|
||
if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil {
|
||
slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr)
|
||
} else if m.eventSender != nil {
|
||
m.eventSender.SendAsync(LifecycleEvent{
|
||
Event: "sandbox.stopped",
|
||
SandboxID: id,
|
||
})
|
||
}
|
||
continue
|
||
}
|
||
|
||
if m.eventSender != nil {
|
||
m.eventSender.SendAsync(LifecycleEvent{
|
||
Event: "sandbox.auto_paused",
|
||
SandboxID: id,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// Shutdown gracefully drains the manager. Running sandboxes are paused so
|
||
// their state survives across agent restarts; any sandboxes still holding
|
||
// runtime resources after PauseAll (e.g. paused failed, or status was
|
||
// Starting/Resuming/Error) are destroyed to release VM / dm / loop / netns.
|
||
// Finally the shared loop registry is fully released.
|
||
func (m *Manager) Shutdown(ctx context.Context) {
|
||
// Flip draining BEFORE close(stopCh) so any Create/Resume already inside
|
||
// its handler-goroutine sees the flag on its next check. Subsequent RPC
|
||
// handlers that load the flag get ErrDraining and return immediately.
|
||
m.draining.Store(true)
|
||
close(m.stopCh)
|
||
|
||
// Cancel in-flight Create calls and wait for them to settle. A slow create
|
||
// (envd readiness wait scales up to ~160s for large VMs) would otherwise
|
||
// register its VM in m.boxes after the destroy loop below has run, leaking
|
||
// it. After the wait each create has either rolled back or registered in
|
||
// m.boxes — where PauseAll / the destroy loop pick it up.
|
||
m.mu.Lock()
|
||
inflight := make([]*createHandle, 0, len(m.creates))
|
||
for _, h := range m.creates {
|
||
h.cancel()
|
||
inflight = append(inflight, h)
|
||
}
|
||
m.mu.Unlock()
|
||
for _, h := range inflight {
|
||
<-h.done
|
||
}
|
||
|
||
// Snapshot every running sandbox. PauseAll calls Pause per-sandbox which
|
||
// internally calls releaseRuntime → frees VM, network, dm-snapshot, and
|
||
// the base-image loop refcount.
|
||
slog.Info("shutdown: pausing running sandboxes")
|
||
m.PauseAll(ctx)
|
||
|
||
// Destroy anything still holding runtime resources. A Paused sandbox has
|
||
// already had releaseRuntime called, so re-destroying it is harmless but
|
||
// also unnecessary — we destroy regardless to remove it from the boxes
|
||
// map and to handle states where Pause failed or wasn't applicable.
|
||
m.mu.RLock()
|
||
ids := make([]string, 0, len(m.boxes))
|
||
for id, sb := range m.boxes {
|
||
// Paused sandboxes already had runtime freed by PauseAll. Leave the
|
||
// snapshot dir on disk so the next agent instance can resume them.
|
||
if sb.Status == models.StatusPaused {
|
||
continue
|
||
}
|
||
ids = append(ids, id)
|
||
}
|
||
m.mu.RUnlock()
|
||
|
||
for _, sbID := range ids {
|
||
slog.Info("shutdown: destroying sandbox", "id", sbID)
|
||
if err := m.Destroy(ctx, sbID); err != nil {
|
||
slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
|
||
continue
|
||
}
|
||
// Notify CP so the DB row flips off running/pausing/error to stopped.
|
||
// Async: a sync Send with CP unreachable can burn ~31s per sandbox
|
||
// (3 × 10s HTTP timeout + backoff) and blow the 5min shutdown budget.
|
||
// Best-effort — if the agent process exits before the goroutine's
|
||
// HTTP request lands, HostMonitor's missing-confirmed-dead reconcile
|
||
// catches it after the next agent restart (it sees the sandbox in DB
|
||
// as 'running'/'missing' but not present in ListSandboxes → stopped).
|
||
if m.eventSender != nil {
|
||
m.eventSender.SendAsync(LifecycleEvent{
|
||
Event: "sandbox.stopped",
|
||
SandboxID: sbID,
|
||
})
|
||
}
|
||
}
|
||
|
||
m.loops.ReleaseAll()
|
||
}
|
||
|
||
// warnErr emits a warning-level log entry carrying the sandbox id and the
// error, and does nothing when err is nil. It exists for best-effort cleanup
// steps in error paths, where the primary error has already been captured and
// a cleanup failure should be recorded but not returned.
func warnErr(msg string, id string, err error) {
	if err == nil {
		return
	}
	slog.Warn(msg, "id", id, "error", err)
}
|
||
|
||
// createResources tracks partially-acquired resources during sandbox creation
|
||
// so they can be rolled back in reverse order on failure.
|
||
type createResources struct {
|
||
sandboxID string
|
||
loops *devicemapper.LoopRegistry
|
||
vm *vm.Manager
|
||
loopImage string
|
||
dmDevice *devicemapper.SnapshotDevice
|
||
cowPath string
|
||
slotIdx int
|
||
slots *network.SlotAllocator
|
||
slot *network.Slot
|
||
rollCow func() // optional custom cow rollback (e.g. rename back)
|
||
}
|
||
|
||
func (r *createResources) rollback() {
|
||
if r.vm != nil && r.sandboxID != "" {
|
||
warnErr("vm destroy error", r.sandboxID, r.vm.Destroy(context.Background(), r.sandboxID))
|
||
}
|
||
if r.slot != nil {
|
||
warnErr("network cleanup error", r.sandboxID, network.RemoveNetwork(r.slot))
|
||
}
|
||
if r.slots != nil && r.slotIdx > 0 {
|
||
r.slots.Release(r.slotIdx)
|
||
}
|
||
if r.dmDevice != nil {
|
||
warnErr("dm-snapshot remove error", r.sandboxID, devicemapper.RemoveSnapshot(context.Background(), r.dmDevice))
|
||
}
|
||
if r.rollCow != nil {
|
||
r.rollCow()
|
||
} else if r.cowPath != "" {
|
||
os.Remove(r.cowPath)
|
||
}
|
||
if r.loopImage != "" {
|
||
r.loops.Release(r.loopImage)
|
||
}
|
||
}
|
||
|
||
// startCrashWatcher monitors the VM process for unexpected exits.
|
||
// If the process exits while the sandbox is still in m.boxes (i.e. not a
|
||
// deliberate Destroy), the sandbox is cleaned up and a sandbox.error event
|
||
// is pushed to the control plane.
|
||
func (m *Manager) startCrashWatcher(sb *sandboxState) {
|
||
v, ok := m.vm.Get(sb.ID)
|
||
if !ok {
|
||
return
|
||
}
|
||
go func() {
|
||
select {
|
||
case <-v.Exited():
|
||
case <-m.stopCh:
|
||
return
|
||
}
|
||
|
||
// Check if this was a deliberate Destroy/Pause (sandbox already removed
|
||
// from boxes, or Pause owns the cleanup). StatusPaused must also bail
|
||
// because the crash watcher races with Pause flipping status to Paused
|
||
// after vm.Destroy is called as part of releaseRuntime.
|
||
m.mu.Lock()
|
||
_, stillAlive := m.boxes[sb.ID]
|
||
if stillAlive && (sb.Status == models.StatusPausing || sb.Status == models.StatusPaused) {
|
||
stillAlive = false
|
||
}
|
||
if stillAlive {
|
||
delete(m.boxes, sb.ID)
|
||
}
|
||
m.mu.Unlock()
|
||
|
||
if !stillAlive {
|
||
return
|
||
}
|
||
|
||
slog.Error("VM process crashed, cleaning up", "id", sb.ID)
|
||
|
||
sb.lifecycleMu.Lock()
|
||
m.cleanupAfterCrash(sb)
|
||
sb.lifecycleMu.Unlock()
|
||
|
||
if m.onDestroy != nil {
|
||
m.onDestroy(sb.ID)
|
||
}
|
||
|
||
if m.eventSender != nil {
|
||
m.eventSender.SendAsync(LifecycleEvent{
|
||
Event: "sandbox.error",
|
||
SandboxID: sb.ID,
|
||
})
|
||
}
|
||
}()
|
||
}
|
||
|
||
// cleanupAfterCrash tears down sandbox resources after a VM crash.
|
||
// The VM process is already dead so we skip vm.Destroy and just clean up
|
||
// network, device-mapper, and loop devices.
|
||
func (m *Manager) cleanupAfterCrash(sb *sandboxState) {
|
||
if sb.memLoadCancel != nil {
|
||
sb.memLoadCancel()
|
||
if sb.memLoadDone != nil {
|
||
<-sb.memLoadDone
|
||
}
|
||
}
|
||
m.stopSampler(sb)
|
||
|
||
// Remove the VM from the vm.Manager's map (process is already dead).
|
||
_ = m.vm.Destroy(context.Background(), sb.ID)
|
||
|
||
if err := network.RemoveNetwork(sb.slot); err != nil {
|
||
slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err)
|
||
}
|
||
m.slots.Release(sb.SlotIndex)
|
||
|
||
if sb.dmDevice != nil {
|
||
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
||
slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err)
|
||
}
|
||
}
|
||
if sb.baseImagePath != "" {
|
||
m.loops.Release(sb.baseImagePath)
|
||
}
|
||
if err := os.RemoveAll(layout.SandboxDir(m.cfg.WrennDir, sb.ID)); err != nil {
|
||
slog.Warn("crash cleanup: sandbox dir error", "id", sb.ID, "error", err)
|
||
}
|
||
}
|
||
|
||
// startSampler resolves the VMM PID and starts a background goroutine
|
||
// that samples CPU/mem/disk at 1s intervals into the ring buffer.
|
||
// Must be called after the sandbox is registered in m.boxes.
|
||
func (m *Manager) startSampler(sb *sandboxState) {
|
||
v, ok := m.vm.Get(sb.ID)
|
||
if !ok {
|
||
slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID)
|
||
return
|
||
}
|
||
|
||
// v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script"
|
||
// invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor)
|
||
// occupies the same PID. v.PID() IS the VMM PID.
|
||
vmmPID := v.PID()
|
||
|
||
sb.vmmPID = vmmPID
|
||
sb.ring = newMetricsRing()
|
||
|
||
ctx, cancel := context.WithCancel(context.Background())
|
||
sb.samplerCancel = cancel
|
||
sb.samplerDone = make(chan struct{})
|
||
|
||
// Read initial CPU counters for delta calculation.
|
||
// Passed to goroutine as local state — no shared mutation.
|
||
initialCPU, err := readCPUStat(vmmPID)
|
||
if err != nil {
|
||
slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err)
|
||
}
|
||
|
||
go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU)
|
||
}
|
||
|
||
// samplerLoop samples metrics at 1s intervals.
|
||
// lastCPU is goroutine-local to avoid shared-state races.
|
||
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) {
|
||
defer close(sb.samplerDone)
|
||
|
||
ticker := time.NewTicker(1 * time.Second)
|
||
defer ticker.Stop()
|
||
|
||
clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux
|
||
lastTime := time.Now()
|
||
cpuInitialized := lastCPU != (cpuStat{})
|
||
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case now := <-ticker.C:
|
||
elapsed := now.Sub(lastTime).Seconds()
|
||
lastTime = now
|
||
|
||
// CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100
|
||
var cpuPct float64
|
||
cur, err := readCPUStat(vmmPID)
|
||
if err == nil {
|
||
if cpuInitialized && elapsed > 0 && vcpus > 0 {
|
||
deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime))
|
||
cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0
|
||
if cpuPct > 100.0 {
|
||
cpuPct = 100.0
|
||
}
|
||
if cpuPct < 0 {
|
||
cpuPct = 0
|
||
}
|
||
}
|
||
lastCPU = cur
|
||
cpuInitialized = true
|
||
}
|
||
|
||
// Memory & disk: guest-reported metrics from envd /metrics.
|
||
// Using the guest's own view for both is accurate and avoids
|
||
// host-side CoW file quirks (sparse allocation, silent errors).
|
||
var memBytes, diskBytes int64
|
||
if m, err := readEnvdMetrics(ctx, sb.client.Load()); err == nil {
|
||
memBytes = m.MemBytes
|
||
diskBytes = m.DiskBytes
|
||
}
|
||
|
||
sb.ring.Push(MetricPoint{
|
||
Timestamp: now,
|
||
CPUPct: cpuPct,
|
||
MemBytes: memBytes,
|
||
DiskBytes: diskBytes,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// stopSampler stops the metrics sampling goroutine and waits for it to exit.
|
||
func (m *Manager) stopSampler(sb *sandboxState) {
|
||
if sb.samplerCancel != nil {
|
||
sb.samplerCancel()
|
||
<-sb.samplerDone
|
||
sb.samplerCancel = nil
|
||
}
|
||
}
|
||
|
||
// GetMetrics returns the ring buffer data for the given range tier.
|
||
// Valid ranges: "10m", "2h", "24h".
|
||
func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) {
|
||
m.mu.RLock()
|
||
sb, ok := m.boxes[sandboxID]
|
||
m.mu.RUnlock()
|
||
if !ok {
|
||
return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
||
}
|
||
if sb.ring == nil {
|
||
return nil, nil
|
||
}
|
||
|
||
// Map the requested range to the appropriate ring tier and time cutoff.
|
||
var points []MetricPoint
|
||
var cutoff time.Duration
|
||
switch rangeTier {
|
||
case "5m":
|
||
points = sb.ring.Get10m()
|
||
cutoff = 5 * time.Minute
|
||
case "10m":
|
||
points = sb.ring.Get10m()
|
||
cutoff = 10 * time.Minute
|
||
case "1h":
|
||
points = sb.ring.Get2h()
|
||
cutoff = 1 * time.Hour
|
||
case "2h":
|
||
points = sb.ring.Get2h()
|
||
cutoff = 2 * time.Hour
|
||
case "6h":
|
||
points = sb.ring.Get24h()
|
||
cutoff = 6 * time.Hour
|
||
case "12h":
|
||
points = sb.ring.Get24h()
|
||
cutoff = 12 * time.Hour
|
||
case "24h":
|
||
points = sb.ring.Get24h()
|
||
cutoff = 24 * time.Hour
|
||
default:
|
||
return nil, fmt.Errorf("%w: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", ErrInvalidRange, rangeTier)
|
||
}
|
||
|
||
// Filter points to the requested time window.
|
||
threshold := time.Now().Add(-cutoff)
|
||
filtered := points[:0:0]
|
||
for _, p := range points {
|
||
if !p.Timestamp.Before(threshold) {
|
||
filtered = append(filtered, p)
|
||
}
|
||
}
|
||
return filtered, nil
|
||
}
|
||
|
||
// FlushMetrics returns all three tier ring buffers, clears the ring, and
|
||
// stops the sampler goroutine. Called by the control plane before pause/destroy.
|
||
func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) {
|
||
m.mu.RLock()
|
||
sb, ok := m.boxes[sandboxID]
|
||
m.mu.RUnlock()
|
||
if !ok {
|
||
return nil, nil, nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
||
}
|
||
|
||
m.stopSampler(sb)
|
||
if sb.ring == nil {
|
||
return nil, nil, nil, nil
|
||
}
|
||
pts10m, pts2h, pts24h = sb.ring.Flush()
|
||
return pts10m, pts2h, pts24h, nil
|
||
}
|