Fix resource leaks, race conditions, and error handling across host agent and control plane: proper sparse file cleanup on close error, connect error wrapping for MakeDir, CoW file cleanup on pause failure, per-sandbox VM directories, deferred map deletion to avoid race in VM destroy, and goroutine launch for extension background workers.

package sandbox

import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"time"

	"github.com/jackc/pgx/v5/pgtype"

	"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
	"git.omukk.dev/wrenn/wrenn/internal/envdclient"
	"git.omukk.dev/wrenn/wrenn/internal/layout"
	"git.omukk.dev/wrenn/wrenn/internal/models"
	"git.omukk.dev/wrenn/wrenn/internal/network"
	"git.omukk.dev/wrenn/wrenn/internal/snapshot"
	"git.omukk.dev/wrenn/wrenn/internal/vm"
	"git.omukk.dev/wrenn/wrenn/pkg/id"
	envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen"
)

// Config holds the paths and defaults for the sandbox manager.
type Config struct {
	WrennDir            string // root directory (e.g. /var/lib/wrenn); all sub-paths derived via layout package
	EnvdTimeout         time.Duration
	DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB

	// Resolved at startup by the host agent.
	KernelPath    string // path to the latest vmlinux-x.y.z
	KernelVersion string // semver extracted from filename
	VMMBin        string // path to the cloud-hypervisor binary
	VMMVersion    string // semver from cloud-hypervisor --version
	AgentVersion  string // host agent version (injected via ldflags)
}

// LifecycleEvent describes an autonomous state change initiated by the agent.
type LifecycleEvent struct {
	Event     string
	SandboxID string
}

// EventSender sends autonomous lifecycle events to the control plane.
type EventSender interface {
	SendAsync(event LifecycleEvent)
}
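
// A minimal sender might post each event to the control plane without
// blocking the caller. Hypothetical sketch (httpSender and its endpoint are
// illustrative, not part of this package):
//
//	type httpSender struct{ endpoint string }
//
//	func (s *httpSender) SendAsync(ev LifecycleEvent) {
//	    go func() {
//	        body, _ := json.Marshal(ev)
//	        resp, err := http.Post(s.endpoint, "application/json", bytes.NewReader(body))
//	        if err == nil {
//	            resp.Body.Close()
//	        }
//	    }()
//	}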

// Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd.
type Manager struct {
	cfg    Config
	vm     *vm.Manager
	slots  *network.SlotAllocator
	loops  *devicemapper.LoopRegistry
	mu     sync.RWMutex
	boxes  map[string]*sandboxState
	stopCh chan struct{}

	autoPausedMu  sync.Mutex
	autoPausedIDs []string

	// onDestroy is called with the sandbox ID after cleanup completes.
	// Used by ProxyHandler to evict cached reverse proxies.
	onDestroy func(sandboxID string)

	// eventSender sends autonomous lifecycle events (auto-pause, auto-destroy)
	// to the CP via HTTP callback. Optional — nil means events are only
	// propagated through the HostMonitor reconciler.
	eventSender EventSender
}

// SetOnDestroy registers a callback invoked after each sandbox is cleaned up.
func (m *Manager) SetOnDestroy(fn func(sandboxID string)) {
	m.onDestroy = fn
}

// SetEventSender registers the callback sender for autonomous lifecycle events.
func (m *Manager) SetEventSender(sender EventSender) {
	m.eventSender = sender
}

// sandboxState holds the runtime state for a single sandbox.
type sandboxState struct {
	models.Sandbox
	lifecycleMu   sync.Mutex // serializes Pause/Destroy/Resume on this sandbox
	slot          *network.Slot
	client        *envdclient.Client
	connTracker   *ConnTracker // tracks in-flight proxy connections for pre-pause drain
	dmDevice      *devicemapper.SnapshotDevice
	baseImagePath string // path to the base template rootfs (for loop registry release)

	// Metrics sampling state.
	vmmPID        int                // VMM process PID (the unshare wrapper execs into the VMM, so they share a PID)
	ring          *metricsRing       // tiered ring buffers for CPU/mem/disk metrics
	samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
	samplerDone   chan struct{}      // closed when the sampling goroutine exits
}

// buildMetadata constructs the metadata map with version information.
func (m *Manager) buildMetadata(envdVersion string) map[string]string {
	meta := map[string]string{
		"kernel_version": m.cfg.KernelVersion,
		"vmm_version":    m.cfg.VMMVersion,
		"agent_version":  m.cfg.AgentVersion,
	}
	if envdVersion != "" {
		meta["envd_version"] = envdVersion
	}
	return meta
}
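
// As a concrete illustration (version strings are made up), buildMetadata
// with envdVersion "0.2.0" would return:
//
//	map[string]string{
//	    "kernel_version": "6.1.0",
//	    "vmm_version":    "39.0",
//	    "agent_version":  "0.3.1",
//	    "envd_version":   "0.2.0",
//	}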

// resolveKernelPath returns the kernel path for the given version hint.
// If the exact version exists on disk, it is used. Otherwise, falls back to
// the latest kernel (m.cfg.KernelPath).
func (m *Manager) resolveKernelPath(versionHint string) string {
	if versionHint == "" {
		return m.cfg.KernelPath
	}
	exact := layout.KernelPathVersioned(m.cfg.WrennDir, versionHint)
	if _, err := os.Stat(exact); err == nil {
		return exact
	}
	slog.Warn("requested kernel version not found, using latest",
		"requested", versionHint, "latest", m.cfg.KernelVersion)
	return m.cfg.KernelPath
}

// New creates a new sandbox manager.
func New(cfg Config) *Manager {
	if cfg.EnvdTimeout == 0 {
		cfg.EnvdTimeout = 30 * time.Second
	}
	return &Manager{
		cfg:    cfg,
		vm:     vm.NewManager(),
		slots:  network.NewSlotAllocator(),
		loops:  devicemapper.NewLoopRegistry(),
		boxes:  make(map[string]*sandboxState),
		stopCh: make(chan struct{}),
	}
}
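
// Typical wiring at host-agent startup, as a sketch (paths and versions are
// illustrative, not defaults of this package):
//
//	m := New(Config{
//	    WrennDir:   "/var/lib/wrenn",
//	    KernelPath: "/var/lib/wrenn/kernels/vmlinux-6.1.0",
//	    VMMBin:     "/usr/local/bin/cloud-hypervisor",
//	})
//	m.StartTTLReaper(ctx)
//	defer m.Shutdown(context.Background())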

// Create boots a new sandbox: clone rootfs, set up network, start VM, wait for envd.
// If sandboxID is empty, a new ID is generated.
func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
	if sandboxID == "" {
		sandboxID = id.FormatSandboxID(id.NewSandboxID())
	}

	if vcpus <= 0 {
		vcpus = 1
	}
	if memoryMB <= 0 {
		memoryMB = 512
	}
	if diskSizeMB <= 0 {
		diskSizeMB = 5120 // 5 GB default
	}

	// Check if template refers to a CH snapshot (has config.json).
	tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	if _, err := os.Stat(filepath.Join(tmplDir, snapshot.CHConfigFile)); err == nil {
		return m.createFromSnapshot(ctx, sandboxID, teamID, templateID, vcpus, memoryMB, timeoutSec, diskSizeMB)
	}

	// Resolve base rootfs image.
	baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
	if _, err := os.Stat(baseRootfs); err != nil {
		return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
	}

	// Acquire shared read-only loop device for the base image.
	originLoop, err := m.loops.Acquire(baseRootfs)
	if err != nil {
		return nil, fmt.Errorf("acquire loop device: %w", err)
	}

	originSize, err := devicemapper.OriginSizeBytes(originLoop)
	if err != nil {
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("get origin size: %w", err)
	}

	// Create dm-snapshot with per-sandbox CoW file.
	// CoW must be at least as large as the origin — if every block is
	// rewritten, the CoW stores a full copy. Undersized CoW causes
	// dm-snapshot invalidation → EIO on all guest I/O.
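	// For example, a 2 GiB origin with diskSizeMB=5120 gets a 5 GiB CoW
	// (the larger of the two); with diskSizeMB=1024 the CoW is still sized
	// to the 2 GiB origin, so even a full rewrite cannot invalidate it.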
	dmName := "wrenn-" + sandboxID
	cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
	cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
	if err != nil {
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("create dm-snapshot: %w", err)
	}

	// Allocate network slot.
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)

	// Set up network.
	if err := network.CreateNetwork(slot); err != nil {
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("create network: %w", err)
	}

	// Boot VM — CH gets the dm device path.
	vmCfg := vm.VMConfig{
		SandboxID:        sandboxID,
		TemplateID:       id.UUIDString(templateID),
		KernelPath:       m.cfg.KernelPath,
		RootfsPath:       dmDev.DevicePath,
		VCPUs:            vcpus,
		MemoryMB:         memoryMB,
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
		VMMBin:           m.cfg.VMMBin,
	}

	if _, err := m.vm.Create(ctx, vmCfg); err != nil {
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("create VM: %w", err)
	}

	// Wait for envd to be ready.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer waitCancel()

	if err := client.WaitUntilReady(waitCtx); err != nil {
		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}

	// Fetch envd version (best-effort).
	envdVersion, _ := client.FetchVersion(ctx)

	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:             sandboxID,
			Status:         models.StatusRunning,
			TemplateTeamID: teamID.Bytes,
			TemplateID:     templateID.Bytes,
			VCPUs:          vcpus,
			MemoryMB:       memoryMB,
			TimeoutSec:     timeoutSec,
			SlotIndex:      slotIdx,
			HostIP:         slot.HostIP,
			RootfsPath:     dmDev.DevicePath,
			CreatedAt:      now,
			LastActiveAt:   now,
			Metadata:       m.buildMetadata(envdVersion),
		},
		slot:          slot,
		client:        client,
		connTracker:   &ConnTracker{},
		dmDevice:      dmDev,
		baseImagePath: baseRootfs,
	}

	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()

	m.startSampler(sb)
	m.startCrashWatcher(sb)

	slog.Info("sandbox created",
		"id", sandboxID,
		"team_id", teamID,
		"template_id", templateID,
		"host_ip", slot.HostIP.String(),
		"dm_device", dmDev.DevicePath,
	)

	return &sb.Sandbox, nil
}
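
// Caller-side sketch (IDs and sizes are illustrative):
//
//	sb, err := m.Create(ctx, "", teamID, templateID, 2, 1024, 300, 5120)
//	if err != nil {
//	    return err
//	}
//	res, err := m.Exec(ctx, sb.ID, "/bin/echo", "hello")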

// Destroy stops and cleans up a sandbox. If the sandbox is running, its VM,
// network, and rootfs are torn down. Any pause snapshot files are also removed.
func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
	m.mu.Lock()
	sb, ok := m.boxes[sandboxID]
	if ok {
		delete(m.boxes, sandboxID)
	}
	m.mu.Unlock()

	if ok {
		// Wait for any in-progress Pause to finish before tearing down resources.
		sb.lifecycleMu.Lock()
		defer sb.lifecycleMu.Unlock()
		m.cleanup(ctx, sb)
	}

	// Always clean up pause snapshot files (may exist if sandbox was paused).
	if err := os.RemoveAll(layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)); err != nil {
		slog.Warn("snapshot cleanup error", "id", sandboxID, "error", err)
	}

	if m.onDestroy != nil {
		m.onDestroy(sandboxID)
	}

	slog.Info("sandbox destroyed", "id", sandboxID)
	return nil
}

// cleanup tears down all resources for a sandbox.
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
	m.stopSampler(sb)
	if err := m.vm.Destroy(ctx, sb.ID); err != nil {
		slog.Warn("vm destroy error", "id", sb.ID, "error", err)
	}
	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("network cleanup error", "id", sb.ID, "error", err)
	}
	m.slots.Release(sb.SlotIndex)

	// Tear down dm-snapshot and release the base image loop device.
	if sb.dmDevice != nil {
		if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
			slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
		}
		os.Remove(sb.dmDevice.CowPath)
	}
	if sb.baseImagePath != "" {
		m.loops.Release(sb.baseImagePath)
	}
}

// Pause takes a snapshot of a running sandbox, then destroys all resources.
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
// After this call, the sandbox is no longer running but can be resumed.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}

	// Serialize lifecycle operations on this sandbox to prevent concurrent
	// Pause/Destroy calls from corrupting VM state.
	sb.lifecycleMu.Lock()
	defer sb.lifecycleMu.Unlock()

	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	// Mark sandbox as pausing to block new exec/file/PTY operations.
	m.mu.Lock()
	sb.Status = models.StatusPausing
	m.mu.Unlock()

	// restoreRunning reverts state if any pre-freeze step fails.
	restoreRunning := func() {
		_ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
		sb.connTracker.Reset()
		m.mu.Lock()
		sb.Status = models.StatusRunning
		m.mu.Unlock()
		m.startSampler(sb)
	}

	// Stop the metrics sampler goroutine before tearing down any resources
	// it reads (dm device, VMM PID). Without this, the sampler goroutine
	// leaks on every successful pause.
	m.stopSampler(sb)

	// ── Step 1: Isolate from external traffic ─────────────────────────
	// Drain in-flight proxy connections (grace period for clean shutdown).
	sb.connTracker.Drain(5 * time.Second)
	// Force-close any connections that didn't finish during grace period.
	sb.connTracker.ForceClose()
	slog.Debug("pause: external connections closed", "id", sandboxID)

	// Close host-side idle connections to envd so FIN packets propagate
	// to the guest kernel before snapshot.
	sb.client.CloseIdleConnections()

	// ── Step 2: Drop page cache ──────────────────────────────────────
	// Signal envd to quiesce: drops page cache, stops port subsystem,
	// marks connections for post-restore cleanup. Page cache drop can
	// take significant time on large-memory VMs (20GB+).
	func() {
		prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
		defer prepCancel()
		if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
			slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
		} else {
			slog.Debug("pause: envd quiesced", "id", sandboxID)
		}
	}()

	// ── Step 3: Inflate balloon to reclaim free guest memory ─────────
	// Freed pages become zero in the snapshot's memory-ranges file.
	// CH v52+ writes sparse snapshots natively (SEEK_DATA/SEEK_HOLE).
	func() {
		memUsed, err := readEnvdMemUsed(ctx, sb.client)
		if err != nil {
			slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err)
			return
		}
		usedMiB := int(memUsed / (1024 * 1024))
		keepMiB := max(usedMiB*3/2, 512)
		inflateMiB := sb.MemoryMB - keepMiB
		if inflateMiB <= 0 {
			slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB)
			return
		}
		balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second)
		defer balloonCancel()
		if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil {
			slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err)
			return
		}
		time.Sleep(2 * time.Second)
		slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB)
	}()
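
	// Worked example: a 4096 MiB VM with 1024 MiB used keeps
	// max(1536, 512) = 1536 MiB and inflates the balloon by 2560 MiB;
	// the reclaimed pages become holes in the sparse snapshot.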

	// ── Step 4: Freeze vCPUs ─────────────────────────────────────────
	pauseStart := time.Now()

	if err := m.vm.Pause(ctx, sandboxID); err != nil {
		restoreRunning()
		return fmt.Errorf("pause VM: %w", err)
	}
	slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))

	// resumeOnError unpauses the VM so the sandbox stays usable when a
	// post-freeze step fails. If the resume itself fails, the sandbox is
	// frozen and unrecoverable — destroy it to avoid a zombie.
	resumeOnError := func() {
		resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer resumeCancel()
		if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
			slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
			m.cleanup(context.Background(), sb)
			m.mu.Lock()
			delete(m.boxes, sandboxID)
			m.mu.Unlock()
			if m.onDestroy != nil {
				m.onDestroy(sandboxID)
			}
			return
		}
		restoreRunning()
	}

	// ── Step 5: Take CH snapshot ─────────────────────────────────────
	// Snapshot to a temp dir first. If the sandbox was previously resumed,
	// the old pauseDir still contains memory-ranges that CH's uffd handler
	// is lazily paging from. CH refuses to overwrite existing files (EEXIST),
	// so we write to a fresh dir and swap after the VM is destroyed.
	pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
	tmpPauseDir := pauseDir + ".new"
	if err := os.RemoveAll(tmpPauseDir); err != nil {
		resumeOnError()
		return fmt.Errorf("clean temp snapshot dir: %w", err)
	}
	if err := os.MkdirAll(tmpPauseDir, 0755); err != nil {
		resumeOnError()
		return fmt.Errorf("create snapshot dir: %w", err)
	}

	snapshotStart := time.Now()
	if err := m.vm.Snapshot(ctx, sandboxID, tmpPauseDir); err != nil {
		slog.Error("pause: snapshot failed", "id", sandboxID, "elapsed", time.Since(snapshotStart), "error", err)
		warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(tmpPauseDir))
		resumeOnError()
		return fmt.Errorf("create VM snapshot: %w", err)
	}
	slog.Debug("pause: CH snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart))

	// ── Step 6: Destroy the VM so CH releases the dm device ──────────
	if err := m.vm.Destroy(ctx, sb.ID); err != nil {
		slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err)
	}

	// CH process is dead — uffd handler no longer reads old memory-ranges.
	// Replace old snapshot dir with new one.
	if err := os.RemoveAll(pauseDir); err != nil {
		slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
	}
	if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
		warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
		m.slots.Release(sb.SlotIndex)
		if sb.dmDevice != nil {
			warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
			os.Remove(sb.dmDevice.CowPath)
		}
		if sb.baseImagePath != "" {
			m.loops.Release(sb.baseImagePath)
		}
		m.mu.Lock()
		delete(m.boxes, sandboxID)
		m.mu.Unlock()
		return fmt.Errorf("rename snapshot dir: %w", err)
	}

	// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
	if sb.dmDevice != nil {
		if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
			warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
			m.slots.Release(sb.SlotIndex)
			warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
			m.mu.Lock()
			delete(m.boxes, sandboxID)
			m.mu.Unlock()
			return fmt.Errorf("remove dm-snapshot: %w", err)
		}

		snapshotCow := snapshot.CowPath(pauseDir, "")
		if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
			warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
			warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
			m.slots.Release(sb.SlotIndex)
			os.Remove(sb.dmDevice.CowPath)
			if sb.baseImagePath != "" {
				m.loops.Release(sb.baseImagePath)
			}
			m.mu.Lock()
			delete(m.boxes, sandboxID)
			m.mu.Unlock()
			return fmt.Errorf("move cow file: %w", err)
		}

		if err := snapshot.WriteMeta(pauseDir, "", &snapshot.RootfsMeta{
			BaseTemplate: sb.baseImagePath,
			TemplateID:   id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}),
			VCPUs:        sb.VCPUs,
			MemoryMB:     sb.MemoryMB,
		}); err != nil {
			warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
			warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
			m.slots.Release(sb.SlotIndex)
			if sb.baseImagePath != "" {
				m.loops.Release(sb.baseImagePath)
			}
			m.mu.Lock()
			delete(m.boxes, sandboxID)
			m.mu.Unlock()
			return fmt.Errorf("write rootfs meta: %w", err)
		}
	}

	// ── Step 8: Clean up remaining resources ─────────────────────────
	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err)
	}
	m.slots.Release(sb.SlotIndex)
	if sb.baseImagePath != "" {
		m.loops.Release(sb.baseImagePath)
	}

	m.mu.Lock()
	delete(m.boxes, sandboxID)
	m.mu.Unlock()

	slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart))
	return nil
}

// Resume restores a paused sandbox from its CH snapshot.
// CH handles memory restore internally (with on-demand paging).
// The sandbox gets a new network slot.
// Optional defaultUser and defaultEnv are applied via PostInit with
// sandbox_id and template_id so envd picks up the new sandbox's metadata.
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) {
	pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
	if _, err := os.Stat(pauseDir); err != nil {
		return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
	}

	// Read rootfs metadata to find the base template image.
	meta, err := snapshot.ReadMeta(pauseDir, "")
	if err != nil {
		return nil, fmt.Errorf("read rootfs meta: %w", err)
	}

	// Acquire the base image loop device and restore dm-snapshot from saved CoW.
	baseImagePath := meta.BaseTemplate
	originLoop, err := m.loops.Acquire(baseImagePath)
	if err != nil {
		return nil, fmt.Errorf("acquire loop device: %w", err)
	}

	originSize, err := devicemapper.OriginSizeBytes(originLoop)
	if err != nil {
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("get origin size: %w", err)
	}

	// Move CoW file from snapshot dir to sandboxes dir for the running sandbox.
	savedCow := snapshot.CowPath(pauseDir, "")
	cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
	if err := os.Rename(savedCow, cowPath); err != nil {
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("move cow file: %w", err)
	}

	rollbackCow := func() {
		if err := os.Rename(cowPath, savedCow); err != nil {
			slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
		}
	}

	// Restore dm-snapshot from existing persistent CoW file.
	dmName := "wrenn-" + sandboxID
	dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize)
	if err != nil {
		m.loops.Release(baseImagePath)
		rollbackCow()
		return nil, fmt.Errorf("restore dm-snapshot: %w", err)
	}

	// Allocate network slot.
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		rollbackCow()
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)

	if err := network.CreateNetwork(slot); err != nil {
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		rollbackCow()
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("create network: %w", err)
	}

	// Restore VM from CH snapshot.
	vmCfg := vm.VMConfig{
		SandboxID:        sandboxID,
		TemplateID:       meta.TemplateID,
		KernelPath:       m.resolveKernelPath(kernelVersion),
		RootfsPath:       dmDev.DevicePath,
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
		VMMBin:           m.cfg.VMMBin,
	}

	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, pauseDir); err != nil {
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		rollbackCow()
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
	}

	// Wait for envd to be ready.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)

	if err := client.WaitUntilReady(waitCtx); err != nil {
		waitCancel()
		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		rollbackCow()
		m.loops.Release(baseImagePath)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}
	waitCancel()

	// PostInit with sandbox_id and template_id so envd sets metadata env vars.
	// Fire-and-forget: post-init is non-critical (metadata/env vars), and
	// blocking here widens the window for the CP monitor to race against us.
	go func() {
		initCtx, initCancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer initCancel()
		if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, meta.TemplateID); err != nil {
			slog.Warn("post-init failed after resume, metadata may be stale", "sandbox", sandboxID, "error", err)
		}
	}()

	// Deflate balloon — the snapshot was taken with an inflated balloon.
	if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
		slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
	}

	// Wait for balloon deflation to settle. With OnDemand UFFD restore,
	// the guest is under severe memory pressure while the balloon is still
	// inflated. If the sampler or memory reclaimer runs during this window,
	// the resulting page fault storm can make the guest kernel unresponsive.
	if meta.MemoryMB > 0 {
		threshold := int64(float64(meta.MemoryMB) * 0.80 * 1024 * 1024)
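		// e.g. a 2048 MiB VM settles once guest "used" drops below
		// ~1638 MiB (80% of total).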
		settleCtx, settleCancel := context.WithTimeout(ctx, 10*time.Second)
		ticker := time.NewTicker(500 * time.Millisecond)
		settled := false
		for !settled {
			select {
			case <-settleCtx.Done():
				slog.Warn("resume: balloon deflation did not settle in time", "id", sandboxID)
				settled = true
			case <-ticker.C:
				used, err := readEnvdMemUsed(settleCtx, client)
				if err != nil {
					continue
				}
				if used < threshold {
					slog.Info("resume: balloon deflation settled", "id", sandboxID, "used_mib", used/(1024*1024))
					settled = true
				}
			}
		}
		ticker.Stop()
		settleCancel()
	}

	// Fetch envd version (best-effort).
	envdVersion, _ := client.FetchVersion(ctx)

	vcpus := meta.VCPUs
	if vcpus <= 0 {
		vcpus = 1
	}
	memoryMB := meta.MemoryMB
	if memoryMB <= 0 {
		memoryMB = 512
	}

	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:           sandboxID,
			Status:       models.StatusRunning,
			VCPUs:        vcpus,
			MemoryMB:     memoryMB,
			TimeoutSec:   timeoutSec,
			SlotIndex:    slotIdx,
			HostIP:       slot.HostIP,
			RootfsPath:   dmDev.DevicePath,
			CreatedAt:    now,
			LastActiveAt: now,
			Metadata:     m.buildMetadata(envdVersion),
		},
		slot:          slot,
		client:        client,
		connTracker:   &ConnTracker{},
		dmDevice:      dmDev,
		baseImagePath: baseImagePath,
	}

	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()

	m.startSampler(sb)
	m.startCrashWatcher(sb)

	slog.Info("sandbox resumed from snapshot",
		"id", sandboxID,
		"host_ip", slot.HostIP.String(),
		"dm_device", dmDev.DevicePath,
		"vcpus", vcpus,
	)

	return &sb.Sandbox, nil
}

// CreateSnapshot creates a reusable template from a sandbox. Works on both
// running and paused sandboxes. If the sandbox is running, it is paused first.
// The sandbox remains paused after this call (it can still be resumed).
//
// The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4
// so the template has no dependency on the original base image. Memory state
// and VM snapshot files are copied as-is.
func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) {
	// If the sandbox is running, pause it first.
	if _, err := m.get(sandboxID); err == nil {
		if err := m.Pause(ctx, sandboxID); err != nil {
			return 0, fmt.Errorf("pause sandbox: %w", err)
		}
	}

	// At this point, pause snapshot files must exist.
	pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
	if _, err := os.Stat(pauseDir); err != nil {
		return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
	}

	// Create template directory.
	dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	if err := os.MkdirAll(dstDir, 0755); err != nil {
		return 0, fmt.Errorf("create template dir: %w", err)
	}

	// Copy CH snapshot files (config.json, memory-ranges, state.json).
	for _, fname := range []string{snapshot.CHConfigFile, snapshot.CHMemRangesFile, snapshot.CHStateFile} {
		src := filepath.Join(pauseDir, fname)
		dst := filepath.Join(dstDir, fname)
		if _, err := os.Stat(src); err != nil {
			continue // some files may not exist
		}
		if err := copyFile(src, dst); err != nil {
			warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
			return 0, fmt.Errorf("copy %s: %w", fname, err)
		}
	}

	// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
	meta, err := snapshot.ReadMeta(pauseDir, "")
	if err != nil {
		warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
		return 0, fmt.Errorf("read rootfs meta: %w", err)
	}

	originLoop, err := m.loops.Acquire(meta.BaseTemplate)
	if err != nil {
		warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
		return 0, fmt.Errorf("acquire loop device for flatten: %w", err)
	}

	originSize, err := devicemapper.OriginSizeBytes(originLoop)
	if err != nil {
		m.loops.Release(meta.BaseTemplate)
		warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
		return 0, fmt.Errorf("get origin size: %w", err)
	}

	// Temporarily restore the dm-snapshot to read the merged view.
	cowPath := snapshot.CowPath(pauseDir, "")
	tmpDmName := "wrenn-flatten-" + sandboxID
	tmpDev, err := devicemapper.RestoreSnapshot(ctx, tmpDmName, originLoop, cowPath, originSize)
	if err != nil {
		m.loops.Release(meta.BaseTemplate)
		warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
		return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err)
	}

	// Flatten to new standalone rootfs.
	flattenedPath := filepath.Join(dstDir, snapshot.RootfsFileName)
	flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath)

	// Always clean up the temporary dm device.
	warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), tmpDev))
	m.loops.Release(meta.BaseTemplate)

	if flattenErr != nil {
		warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
		return 0, fmt.Errorf("flatten rootfs: %w", flattenErr)
	}

	sizeBytes, err := snapshot.DirSize(dstDir, "")
	if err != nil {
		slog.Warn("failed to calculate snapshot size", "error", err)
	}

	slog.Info("template snapshot created (rootfs flattened)",
		"sandbox", sandboxID,
		"team_id", teamID,
		"template_id", templateID,
		"size_bytes", sizeBytes,
	)
	return sizeBytes, nil
}

// FlattenRootfs stops a running sandbox, flattens its device-mapper CoW
// rootfs into a standalone rootfs.ext4, and cleans up all resources.
// The result is an image-only template (no VM memory/CPU state) stored as
// rootfs.ext4 in the template directory.
func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) {
	m.mu.Lock()
	sb, ok := m.boxes[sandboxID]
	if ok {
		delete(m.boxes, sandboxID)
	}
	m.mu.Unlock()

	if !ok {
		return 0, fmt.Errorf("sandbox %s not found", sandboxID)
	}

	// Flush guest page cache to disk before stopping the VM. Without this,
	// files written by the build (e.g. pip-installed packages) may exist in the
	// guest's page cache but not yet on the dm block device — flatten would then
	// capture 0-byte files.
	func() {
		syncCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
		defer cancel()
		if _, err := sb.client.Exec(syncCtx, "/bin/sync"); err != nil {
			slog.Warn("flatten: guest sync failed (non-fatal)", "id", sb.ID, "error", err)
		}
	}()

	// Stop the VM but keep the dm device alive for flattening.
	m.stopSampler(sb)
	if err := m.vm.Destroy(ctx, sb.ID); err != nil {
		slog.Warn("vm destroy error during flatten", "id", sb.ID, "error", err)
	}

	// Release network resources — not needed after VM is stopped.
	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("network cleanup error during flatten", "id", sb.ID, "error", err)
	}
	m.slots.Release(sb.SlotIndex)

	// Create template directory and flatten the dm-snapshot.
	flattenDstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	if err := os.MkdirAll(flattenDstDir, 0755); err != nil {
		m.cleanupDM(sb)
		return 0, fmt.Errorf("create template dir: %w", err)
	}

	outputPath := filepath.Join(flattenDstDir, snapshot.RootfsFileName)
	if sb.dmDevice == nil {
		m.cleanupDM(sb)
		warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir))
		return 0, fmt.Errorf("sandbox %s has no dm device", sandboxID)
	}

	if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, outputPath); err != nil {
		m.cleanupDM(sb)
		warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir))
		return 0, fmt.Errorf("flatten rootfs: %w", err)
	}

	// Clean up dm device and loop device now that flatten is complete.
	m.cleanupDM(sb)

	// Shrink the flattened image to its minimum size, then re-expand to the
	// configured default rootfs size so sandboxes see the full disk from boot.
	if out, err := exec.Command("e2fsck", "-fy", outputPath).CombinedOutput(); err != nil {
		if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() > 1 {
			slog.Warn("e2fsck before shrink failed (non-fatal)", "output", string(out), "error", err)
		}
	}
	if out, err := exec.Command("resize2fs", "-M", outputPath).CombinedOutput(); err != nil {
		slog.Warn("resize2fs -M failed (non-fatal)", "output", string(out), "error", err)
	}

	// Re-expand to default rootfs size.
	targetMB := m.cfg.DefaultRootfsSizeMB
	if targetMB <= 0 {
		targetMB = DefaultDiskSizeMB
	}
	if err := expandImage(outputPath, int64(targetMB)*1024*1024, targetMB); err != nil {
		slog.Warn("failed to expand template to default size (non-fatal)", "error", err)
	}

	sizeBytes, err := snapshot.DirSize(flattenDstDir, "")
	if err != nil {
		slog.Warn("failed to calculate template size", "error", err)
	}

	slog.Info("rootfs flattened to image-only template",
		"sandbox", sandboxID,
		"team_id", teamID,
		"template_id", templateID,
		"size_bytes", sizeBytes,
	)
	return sizeBytes, nil
}

// cleanupDM tears down the dm-snapshot device and releases the base image loop device.
func (m *Manager) cleanupDM(sb *sandboxState) {
	if sb.dmDevice != nil {
		if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
			slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
		}
		os.Remove(sb.dmDevice.CowPath)
	}
	if sb.baseImagePath != "" {
		m.loops.Release(sb.baseImagePath)
	}
}

// DeleteSnapshot removes a snapshot template from disk.
func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error {
	return os.RemoveAll(layout.TemplateDir(m.cfg.WrennDir, teamID, templateID))
}

// createFromSnapshot creates a new sandbox by restoring from a snapshot template.
// CH handles memory restore internally (with on-demand paging).
// The template's rootfs.ext4 is a flattened standalone image — we create a
// dm-snapshot on top of it just like a normal Create.
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, _, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
	tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)

	// Set up dm-snapshot on the template's flattened rootfs.
	baseRootfs := filepath.Join(tmplDir, snapshot.RootfsFileName)
	originLoop, err := m.loops.Acquire(baseRootfs)
	if err != nil {
		return nil, fmt.Errorf("acquire loop device: %w", err)
	}

	originSize, err := devicemapper.OriginSizeBytes(originLoop)
	if err != nil {
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("get origin size: %w", err)
	}

	dmName := "wrenn-" + sandboxID
	cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
	cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
	if err != nil {
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("create dm-snapshot: %w", err)
	}

	// Allocate network.
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)

	if err := network.CreateNetwork(slot); err != nil {
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("create network: %w", err)
	}

	// Restore VM from CH snapshot.
	vmCfg := vm.VMConfig{
		SandboxID:        sandboxID,
		TemplateID:       id.UUIDString(templateID),
		KernelPath:       m.cfg.KernelPath,
		RootfsPath:       dmDev.DevicePath,
		VCPUs:            vcpus,
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
		VMMBin:           m.cfg.VMMBin,
	}

	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, tmplDir); err != nil {
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
	}

	// Wait for envd.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)

	if err := client.WaitUntilReady(waitCtx); err != nil {
		waitCancel()
		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
		m.slots.Release(slotIdx)
		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
		os.Remove(cowPath)
		m.loops.Release(baseRootfs)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}
	waitCancel()

	// PostInit with sandbox_id and template_id so envd sets metadata.
	initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer initCancel()

	if err := client.PostInitWithDefaults(initCtx, "", nil, sandboxID, id.UUIDString(templateID)); err != nil {
		slog.Warn("post-init failed after template restore, metadata may be stale", "sandbox", sandboxID, "error", err)
	}

	// Deflate balloon — template snapshot was taken with an inflated balloon.
	if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
		slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
	}

	// Fetch envd version (best-effort).
	envdVersion, _ := client.FetchVersion(ctx)

	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:             sandboxID,
			Status:         models.StatusRunning,
			TemplateTeamID: teamID.Bytes,
			TemplateID:     templateID.Bytes,
			VCPUs:          vcpus,
			TimeoutSec:     timeoutSec,
			SlotIndex:      slotIdx,
			HostIP:         slot.HostIP,
			RootfsPath:     dmDev.DevicePath,
			CreatedAt:      now,
			LastActiveAt:   now,
			Metadata:       m.buildMetadata(envdVersion),
		},
		slot:          slot,
		client:        client,
		connTracker:   &ConnTracker{},
		dmDevice:      dmDev,
		baseImagePath: baseRootfs,
	}

	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()

	m.startSampler(sb)
	m.startCrashWatcher(sb)

	slog.Info("sandbox created from snapshot",
		"id", sandboxID,
		"team_id", teamID,
		"template_id", templateID,
		"host_ip", slot.HostIP.String(),
		"dm_device", dmDev.DevicePath,
	)

	return &sb.Sandbox, nil
}

// Exec runs a command inside a sandbox.
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args ...string) (*envdclient.ExecResult, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}

	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.Exec(ctx, cmd, args...)
}

// ExecStream runs a command inside a sandbox and returns a channel of streaming events.
func (m *Manager) ExecStream(ctx context.Context, sandboxID string, cmd string, args ...string) (<-chan envdclient.ExecStreamEvent, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}

	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.ExecStream(ctx, cmd, args...)
}

// List returns all sandboxes.
func (m *Manager) List() []models.Sandbox {
	m.mu.RLock()
	defer m.mu.RUnlock()

	result := make([]models.Sandbox, 0, len(m.boxes))
	for _, sb := range m.boxes {
		result = append(result, sb.Sandbox)
	}
	return result
}

// Get returns a sandbox by ID.
func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	return &sb.Sandbox, nil
}

// GetClient returns the envd client for a sandbox.
func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}
	return sb.client, nil
}

// SetDefaults calls envd's PostInit to configure the default user and
// environment variables for a running sandbox. This is called by the host
// agent after sandbox creation or resume when the template specifies defaults.
func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string, defaultEnv map[string]string) error {
	if defaultUser == "" && len(defaultEnv) == 0 {
		return nil
	}
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}
	return sb.client.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
}

// PtyAttach starts a new PTY process or reconnects to an existing one.
// If cmd is non-empty, starts a new process. If empty, reconnects using tag.
func (m *Manager) PtyAttach(ctx context.Context, sandboxID, tag, cmd string, args []string, cols, rows uint32, envs map[string]string, cwd string) (<-chan envdclient.PtyEvent, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	if cmd != "" {
		return sb.client.PtyStart(ctx, tag, cmd, args, cols, rows, envs, cwd)
	}
	return sb.client.PtyConnect(ctx, tag)
}

// PtySendInput sends raw bytes to a PTY process in a sandbox.
func (m *Manager) PtySendInput(ctx context.Context, sandboxID, tag string, data []byte) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.PtySendInput(ctx, tag, data)
}

// PtyResize updates the terminal dimensions for a PTY process in a sandbox.
func (m *Manager) PtyResize(ctx context.Context, sandboxID, tag string, cols, rows uint32) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	return sb.client.PtyResize(ctx, tag, cols, rows)
}

// PtyKill sends SIGKILL to a PTY process in a sandbox.
func (m *Manager) PtyKill(ctx context.Context, sandboxID, tag string) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	return sb.client.PtyKill(ctx, tag)
}

// StartBackground starts a background process inside a sandbox.
func (m *Manager) StartBackground(ctx context.Context, sandboxID, tag, cmd string, args []string, envs map[string]string, cwd string) (uint32, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return 0, err
	}
	if sb.Status != models.StatusRunning {
		return 0, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.StartBackground(ctx, tag, cmd, args, envs, cwd)
}

// ConnectProcess re-attaches to a running process inside a sandbox.
func (m *Manager) ConnectProcess(ctx context.Context, sandboxID string, pid uint32, tag string) (<-chan envdclient.ExecStreamEvent, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.ConnectProcess(ctx, pid, tag)
}

// ListProcesses returns all running processes inside a sandbox.
func (m *Manager) ListProcesses(ctx context.Context, sandboxID string) ([]envdclient.ProcessInfo, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.ListProcesses(ctx)
}

// KillProcess sends a signal to a process inside a sandbox.
func (m *Manager) KillProcess(ctx context.Context, sandboxID string, pid uint32, tag string, signal envdpb.Signal) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.KillProcess(ctx, pid, tag, signal)
}

// AcquireProxyConn atomically looks up a sandbox by ID and registers an
// in-flight proxy connection. Returns the sandbox's host-reachable IP, the
// connection tracker, and true on success. The caller must call
// tracker.Release() when the request completes. Returns zero values and
// false if the sandbox is not found, not running, or is draining for a pause.
func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) {
	m.mu.RLock()
	sb, ok := m.boxes[sandboxID]
	m.mu.RUnlock()

	if !ok || sb.Status != models.StatusRunning {
		return nil, nil, false
	}
	if !sb.connTracker.Acquire() {
		return nil, nil, false
	}
	return sb.HostIP, sb.connTracker, true
}
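
// Proxy-side sketch of the acquire/release contract (handler shape is
// illustrative):
//
//	ip, tracker, ok := m.AcquireProxyConn(sandboxID)
//	if !ok {
//	    http.Error(w, "sandbox unavailable", http.StatusBadGateway)
//	    return
//	}
//	defer tracker.Release()
//	// ... forward the request to ip ...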

// Ping resets the inactivity timer for a running sandbox.
func (m *Manager) Ping(sandboxID string) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	sb, ok := m.boxes[sandboxID]
	if !ok {
		return fmt.Errorf("sandbox not found: %s", sandboxID)
	}
	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}
	sb.LastActiveAt = time.Now()
	return nil
}

// DrainAutoPausedIDs returns and clears the list of sandbox IDs that were
// automatically paused by the TTL reaper since the last call.
func (m *Manager) DrainAutoPausedIDs() []string {
	m.autoPausedMu.Lock()
	defer m.autoPausedMu.Unlock()

	ids := m.autoPausedIDs
	m.autoPausedIDs = nil
	return ids
}

func (m *Manager) get(sandboxID string) (*sandboxState, error) {
	m.mu.RLock()
	defer m.mu.RUnlock()

	sb, ok := m.boxes[sandboxID]
	if !ok {
		return nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}
	return sb, nil
}

// StartTTLReaper starts a background goroutine that destroys sandboxes
// that have exceeded their TTL (timeout_sec of inactivity).
func (m *Manager) StartTTLReaper(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(2 * time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-m.stopCh:
				return
			case <-ticker.C:
				m.reapExpired(ctx)
			}
		}
	}()
}

func (m *Manager) reapExpired(_ context.Context) {
	m.mu.RLock()
	var expired []string
	now := time.Now()
	for id, sb := range m.boxes {
		if sb.TimeoutSec <= 0 {
			continue
		}
		if sb.Status != models.StatusRunning {
			continue
		}
		if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
			expired = append(expired, id)
		}
	}
	m.mu.RUnlock()

	for _, id := range expired {
		slog.Info("TTL expired, auto-pausing sandbox", "id", id)
		// Use a detached context so that an app shutdown does not cancel
		// a pause mid-flight, which would leave the VM frozen without a
		// valid snapshot.
		pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
		err := m.Pause(pauseCtx, id)
		cancel()
		if err != nil {
			slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err)
			if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil {
				slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr)
			} else if m.eventSender != nil {
				m.eventSender.SendAsync(LifecycleEvent{
					Event:     "sandbox.stopped",
					SandboxID: id,
				})
			}
			continue
		}
		m.autoPausedMu.Lock()
		m.autoPausedIDs = append(m.autoPausedIDs, id)
		m.autoPausedMu.Unlock()

		if m.eventSender != nil {
			m.eventSender.SendAsync(LifecycleEvent{
				Event:     "sandbox.auto_paused",
				SandboxID: id,
			})
		}
	}
}

// Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper.
func (m *Manager) Shutdown(ctx context.Context) {
	close(m.stopCh)

	m.mu.Lock()
	ids := make([]string, 0, len(m.boxes))
	for id := range m.boxes {
		ids = append(ids, id)
	}
	m.mu.Unlock()

	for _, sbID := range ids {
		slog.Info("shutdown: destroying sandbox", "id", sbID)
		if err := m.Destroy(ctx, sbID); err != nil {
			slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
		}
	}

	m.loops.ReleaseAll()
}

// PauseAll pauses every running sandbox managed by this host agent.
// Called when the host loses connectivity to the control plane to avoid
// leaving running VMs unmanaged. It is best-effort: failures for individual
// sandboxes are logged but do not stop the rest.
func (m *Manager) PauseAll(ctx context.Context) {
	m.mu.RLock()
	ids := make([]string, 0, len(m.boxes))
	for id, sb := range m.boxes {
		if sb.Status == models.StatusRunning {
			ids = append(ids, id)
		}
	}
	m.mu.RUnlock()

	slog.Info("pausing all running sandboxes due to CP connection loss", "count", len(ids))
	for _, sbID := range ids {
		if err := m.Pause(ctx, sbID); err != nil {
			slog.Warn("PauseAll: failed to pause sandbox", "id", sbID, "error", err)
		}
	}
}

// warnErr logs a warning if err is non-nil. Used for best-effort cleanup
// in error paths where the primary error has already been captured.
func warnErr(msg string, id string, err error) {
	if err != nil {
		slog.Warn(msg, "id", id, "error", err)
	}
}

// startCrashWatcher monitors the VM process for unexpected exits.
// If the process exits while the sandbox is still in m.boxes (i.e. not a
// deliberate Destroy), the sandbox is cleaned up and a sandbox.error event
// is pushed to the control plane.
func (m *Manager) startCrashWatcher(sb *sandboxState) {
	v, ok := m.vm.Get(sb.ID)
	if !ok {
		return
	}
	go func() {
		select {
		case <-v.Exited():
		case <-m.stopCh:
			return
		}

		// Check if this was a deliberate Destroy/Pause (sandbox already removed
		// from boxes, or Pause owns the cleanup).
		m.mu.Lock()
		_, stillAlive := m.boxes[sb.ID]
		if stillAlive && sb.Status == models.StatusPausing {
			stillAlive = false
		}
		if stillAlive {
			delete(m.boxes, sb.ID)
		}
		m.mu.Unlock()

		if !stillAlive {
			return
		}

		slog.Error("VM process crashed, cleaning up", "id", sb.ID)

		sb.lifecycleMu.Lock()
		m.cleanupAfterCrash(sb)
		sb.lifecycleMu.Unlock()

		if m.onDestroy != nil {
			m.onDestroy(sb.ID)
		}

		if m.eventSender != nil {
			m.eventSender.SendAsync(LifecycleEvent{
				Event:     "sandbox.error",
				SandboxID: sb.ID,
			})
		}
	}()
}

// cleanupAfterCrash tears down sandbox resources after a VM crash.
// The VM process is already dead so we skip vm.Destroy and just clean up
// network, device-mapper, and loop devices.
func (m *Manager) cleanupAfterCrash(sb *sandboxState) {
	m.stopSampler(sb)

	// Remove the VM from the vm.Manager's map (process is already dead).
	_ = m.vm.Destroy(context.Background(), sb.ID)

	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err)
	}
	m.slots.Release(sb.SlotIndex)

	if sb.dmDevice != nil {
		if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
			slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err)
		}
		os.Remove(sb.dmDevice.CowPath)
	}
	if sb.baseImagePath != "" {
		m.loops.Release(sb.baseImagePath)
	}
}

// startSampler resolves the VMM PID and starts a background goroutine
// that samples CPU/mem/disk at 1s intervals into the ring buffer.
// Must be called after the sandbox is registered in m.boxes.
func (m *Manager) startSampler(sb *sandboxState) {
	v, ok := m.vm.Get(sb.ID)
	if !ok {
		slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID)
		return
	}

	// v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script"
	// invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor)
	// occupies the same PID. v.PID() IS the VMM PID.
	vmmPID := v.PID()

	sb.vmmPID = vmmPID
	sb.ring = newMetricsRing()

	ctx, cancel := context.WithCancel(context.Background())
	sb.samplerCancel = cancel
	sb.samplerDone = make(chan struct{})

	// Read initial CPU counters for delta calculation.
	// Passed to goroutine as local state — no shared mutation.
	initialCPU, err := readCPUStat(vmmPID)
	if err != nil {
		slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err)
	}

	go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU)
}

// samplerLoop samples metrics at 1s intervals.
// lastCPU is goroutine-local to avoid shared-state races.
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) {
	defer close(sb.samplerDone)

	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux
	lastTime := time.Now()
	cpuInitialized := lastCPU != (cpuStat{})

	for {
		select {
		case <-ctx.Done():
			return
		case now := <-ticker.C:
			elapsed := now.Sub(lastTime).Seconds()
			lastTime = now

			// CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100
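			// Worked example: 150 jiffies consumed over a 1s tick on a
			// 2-vCPU VM gives 150/(1*100*2)*100 = 75% of guest capacity.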
			var cpuPct float64
			cur, err := readCPUStat(vmmPID)
			if err == nil {
				if cpuInitialized && elapsed > 0 && vcpus > 0 {
					deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime))
					cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0
					if cpuPct > 100.0 {
						cpuPct = 100.0
					}
					if cpuPct < 0 {
						cpuPct = 0
					}
				}
				lastCPU = cur
				cpuInitialized = true
			}

			// Memory: guest-reported used memory from envd /metrics.
			// VmRSS of the VMM process includes guest page cache and never
			// decreases, so we use the guest's own view which reports
			// total - available (actual process memory).
			memBytes, _ := readEnvdMemUsed(ctx, sb.client)

			// Disk: allocated bytes of the CoW sparse file.
			var diskBytes int64
			if sb.dmDevice != nil {
				diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath)
			}

			sb.ring.Push(MetricPoint{
				Timestamp: now,
				CPUPct:    cpuPct,
				MemBytes:  memBytes,
				DiskBytes: diskBytes,
			})
		}
	}
}

// stopSampler stops the metrics sampling goroutine and waits for it to exit.
func (m *Manager) stopSampler(sb *sandboxState) {
	if sb.samplerCancel != nil {
		sb.samplerCancel()
		<-sb.samplerDone
		sb.samplerCancel = nil
	}
}

// GetMetrics returns the ring buffer data for the given range tier.
// Valid ranges: "5m", "10m", "1h", "2h", "6h", "12h", "24h".
func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) {
	m.mu.RLock()
	sb, ok := m.boxes[sandboxID]
	m.mu.RUnlock()
	if !ok {
		return nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}
	if sb.ring == nil {
		return nil, nil
	}

	// Map the requested range to the appropriate ring tier and time cutoff.
	var points []MetricPoint
	var cutoff time.Duration
	switch rangeTier {
	case "5m":
		points = sb.ring.Get10m()
		cutoff = 5 * time.Minute
	case "10m":
		points = sb.ring.Get10m()
		cutoff = 10 * time.Minute
	case "1h":
		points = sb.ring.Get2h()
		cutoff = 1 * time.Hour
	case "2h":
		points = sb.ring.Get2h()
		cutoff = 2 * time.Hour
	case "6h":
		points = sb.ring.Get24h()
		cutoff = 6 * time.Hour
	case "12h":
		points = sb.ring.Get24h()
		cutoff = 12 * time.Hour
	case "24h":
		points = sb.ring.Get24h()
		cutoff = 24 * time.Hour
	default:
		return nil, fmt.Errorf("invalid range: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", rangeTier)
	}

	// Filter points to the requested time window.
	threshold := time.Now().Add(-cutoff)
	filtered := points[:0:0]
	for _, p := range points {
		if !p.Timestamp.Before(threshold) {
			filtered = append(filtered, p)
		}
	}
	return filtered, nil
}
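
// Usage sketch: m.GetMetrics(sandboxID, "10m") returns the 10m-tier samples
// recorded in the last ten minutes; coarser tiers back the longer windows.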

// FlushMetrics returns all three tier ring buffers, clears the ring, and
// stops the sampler goroutine. Called by the control plane before pause/destroy.
func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) {
	m.mu.RLock()
	sb, ok := m.boxes[sandboxID]
	m.mu.RUnlock()
	if !ok {
		return nil, nil, nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}

	m.stopSampler(sb)
	if sb.ring == nil {
		return nil, nil, nil, nil
	}
	pts10m, pts2h, pts24h = sb.ring.Flush()
	return pts10m, pts2h, pts24h, nil
}

// copyFile copies a regular file from src to dst using streaming I/O.
func copyFile(src, dst string) error {
	sf, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("open %s: %w", src, err)
	}
	defer sf.Close()

	df, err := os.Create(dst)
	if err != nil {
		return fmt.Errorf("create %s: %w", dst, err)
	}
	defer df.Close()

	if _, err := df.ReadFrom(sf); err != nil {
		os.Remove(dst)
		return fmt.Errorf("copy %s → %s: %w", src, dst, err)
	}
	return nil
}