forked from wrenn/wrenn
- Decompose executeBuild (318 lines) into provisionBuildSandbox and finalizeBuild helpers for readability
- Extract cleanupPauseFailure in sandbox manager to unify 3 inconsistent inline teardown paths (also fixes CoW file leak on rename failure)
- Remove unused ctx parameter from startProcess/startProcessForRestore
- Add missing MASQUERADE rollback entry in CreateNetwork for symmetry
- Consolidate duplicate writeJSON for UTF-8/base64 exec response
1809 lines
58 KiB
Go
package sandbox
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"net"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
|
|
"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
|
|
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
|
|
"git.omukk.dev/wrenn/wrenn/internal/layout"
|
|
"git.omukk.dev/wrenn/wrenn/internal/models"
|
|
"git.omukk.dev/wrenn/wrenn/internal/network"
|
|
"git.omukk.dev/wrenn/wrenn/internal/snapshot"
|
|
"git.omukk.dev/wrenn/wrenn/internal/vm"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/id"
|
|
envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen"
|
|
)
|
|
|
|
// ErrNotFound is returned when a sandbox is not present in the in-memory map.
|
|
var ErrNotFound = errors.New("sandbox not found")
|
|
|
|
// Config holds the paths and defaults for the sandbox manager.
|
|
type Config struct {
|
|
WrennDir string // root directory (e.g. /var/lib/wrenn); all sub-paths derived via layout package
|
|
EnvdTimeout time.Duration
|
|
DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB
|
|
|
|
// Resolved at startup by the host agent.
|
|
KernelPath string // path to the latest vmlinux-x.y.z
|
|
KernelVersion string // semver extracted from filename
|
|
VMMBin string // path to the cloud-hypervisor binary
|
|
VMMVersion string // semver from cloud-hypervisor --version
|
|
AgentVersion string // host agent version (injected via ldflags)
|
|
}
|
|
|
|
// LifecycleEvent describes an autonomous state change initiated by the agent.
|
|
type LifecycleEvent struct {
|
|
Event string
|
|
SandboxID string
|
|
}
|
|
|
|
// EventSender sends autonomous lifecycle events to the control plane.
|
|
type EventSender interface {
|
|
SendAsync(event LifecycleEvent)
|
|
}
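// Implementations are expected to wrap the HTTP callback to the control plane
// and return without blocking the caller. A rough sketch (cpSender and post()
// are illustrative names, not part of this codebase):
//
//	type cpSender struct{ /* http.Client, CP callback URL */ }
//
//	func (s *cpSender) SendAsync(ev LifecycleEvent) { go s.post(ev) }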
|
|
|
|
// Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd.
|
|
type Manager struct {
|
|
cfg Config
|
|
vm *vm.Manager
|
|
slots *network.SlotAllocator
|
|
loops *devicemapper.LoopRegistry
|
|
mu sync.RWMutex
|
|
boxes map[string]*sandboxState
|
|
stopCh chan struct{}
|
|
|
|
autoPausedMu sync.Mutex
|
|
autoPausedIDs []string
|
|
|
|
// onDestroy is called with the sandbox ID after cleanup completes.
|
|
// Used by ProxyHandler to evict cached reverse proxies.
|
|
onDestroy func(sandboxID string)
|
|
|
|
// eventSender sends autonomous lifecycle events (auto-pause, auto-destroy)
|
|
// to the CP via HTTP callback. Optional — nil means events are only
|
|
// propagated through the HostMonitor reconciler.
|
|
eventSender EventSender
|
|
}
|
|
|
|
// SetOnDestroy registers a callback invoked after each sandbox is cleaned up.
|
|
func (m *Manager) SetOnDestroy(fn func(sandboxID string)) {
|
|
m.onDestroy = fn
|
|
}
|
|
|
|
// SetEventSender registers the callback sender for autonomous lifecycle events.
|
|
func (m *Manager) SetEventSender(sender EventSender) {
|
|
m.eventSender = sender
|
|
}
|
|
|
|
// sandboxState holds the runtime state for a single sandbox.
|
|
type sandboxState struct {
|
|
models.Sandbox
|
|
lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox
|
|
slot *network.Slot
|
|
client *envdclient.Client
|
|
connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain
|
|
dmDevice *devicemapper.SnapshotDevice
|
|
baseImagePath string // path to the base template rootfs (for loop registry release)
|
|
|
|
// Metrics sampling state.
|
|
vmmPID int // VMM process PID (child of unshare wrapper)
|
|
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
|
|
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
|
|
samplerDone chan struct{} // closed when the sampling goroutine exits
|
|
}
|
|
|
|
// buildMetadata constructs the metadata map with version information.
|
|
func (m *Manager) buildMetadata(envdVersion string) map[string]string {
|
|
meta := map[string]string{
|
|
"kernel_version": m.cfg.KernelVersion,
|
|
"vmm_version": m.cfg.VMMVersion,
|
|
"agent_version": m.cfg.AgentVersion,
|
|
}
|
|
if envdVersion != "" {
|
|
meta["envd_version"] = envdVersion
|
|
}
|
|
return meta
|
|
}
|
|
|
|
// resolveKernelPath returns the kernel path for the given version hint.
|
|
// If the exact version exists on disk, it is used. Otherwise, falls back to
|
|
// the latest kernel (m.cfg.KernelPath).
|
|
func (m *Manager) resolveKernelPath(versionHint string) string {
|
|
if versionHint == "" {
|
|
return m.cfg.KernelPath
|
|
}
|
|
exact := layout.KernelPathVersioned(m.cfg.WrennDir, versionHint)
|
|
if _, err := os.Stat(exact); err == nil {
|
|
return exact
|
|
}
|
|
slog.Warn("requested kernel version not found, using latest",
|
|
"requested", versionHint, "latest", m.cfg.KernelVersion)
|
|
return m.cfg.KernelPath
|
|
}
|
|
|
|
// New creates a new sandbox manager.
|
|
func New(cfg Config) *Manager {
|
|
if cfg.EnvdTimeout == 0 {
|
|
cfg.EnvdTimeout = 30 * time.Second
|
|
}
|
|
return &Manager{
|
|
cfg: cfg,
|
|
vm: vm.NewManager(),
|
|
slots: network.NewSlotAllocator(),
|
|
loops: devicemapper.NewLoopRegistry(),
|
|
boxes: make(map[string]*sandboxState),
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
}
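// Typical wiring from the host agent, as a rough sketch (KernelPath, VMMBin,
// and the SetOnDestroy/SetEventSender hookups are resolved at startup and
// omitted here):
//
//	mgr := sandbox.New(sandbox.Config{WrennDir: "/var/lib/wrenn"})
//	mgr.StartTTLReaper(ctx)
//	defer mgr.Shutdown(context.Background())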
|
|
|
|
// Create boots a new sandbox: clone rootfs, set up network, start VM, wait for envd.
|
|
// If sandboxID is empty, a new ID is generated.
|
|
func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
|
|
if sandboxID == "" {
|
|
sandboxID = id.FormatSandboxID(id.NewSandboxID())
|
|
}
|
|
|
|
if vcpus <= 0 {
|
|
vcpus = 1
|
|
}
|
|
if memoryMB <= 0 {
|
|
memoryMB = 512
|
|
}
|
|
if diskSizeMB <= 0 {
|
|
diskSizeMB = 5120 // 5 GB default
|
|
}
|
|
|
|
// Check if template refers to a CH snapshot (has config.json).
|
|
tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
|
|
if _, err := os.Stat(filepath.Join(tmplDir, snapshot.CHConfigFile)); err == nil {
|
|
return m.createFromSnapshot(ctx, sandboxID, teamID, templateID, vcpus, memoryMB, timeoutSec, diskSizeMB)
|
|
}
|
|
|
|
// Resolve base rootfs image.
|
|
baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
|
|
if _, err := os.Stat(baseRootfs); err != nil {
|
|
return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
|
|
}
|
|
|
|
// Acquire shared read-only loop device for the base image.
|
|
originLoop, err := m.loops.Acquire(baseRootfs)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("acquire loop device: %w", err)
|
|
}
|
|
|
|
originSize, err := devicemapper.OriginSizeBytes(originLoop)
|
|
if err != nil {
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("get origin size: %w", err)
|
|
}
|
|
|
|
// Create dm-snapshot with per-sandbox CoW file.
|
|
// CoW must be at least as large as the origin — if every block is
|
|
// rewritten, the CoW stores a full copy. Undersized CoW causes
|
|
// dm-snapshot invalidation → EIO on all guest I/O.
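// For example, a 5 GiB request (diskSizeMB=5120) against an 8 GiB base image
// still gets an 8 GiB CoW file via the max() below.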
|
|
dmName := "wrenn-" + sandboxID
|
|
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
|
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
|
|
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
|
if err != nil {
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("create dm-snapshot: %w", err)
|
|
}
|
|
|
|
// Allocate network slot.
|
|
slotIdx, err := m.slots.Allocate()
|
|
if err != nil {
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("allocate network slot: %w", err)
|
|
}
|
|
slot := network.NewSlot(slotIdx)
|
|
|
|
// Set up network.
|
|
if err := network.CreateNetwork(slot); err != nil {
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("create network: %w", err)
|
|
}
|
|
|
|
// Boot VM — CH gets the dm device path.
|
|
vmCfg := vm.VMConfig{
|
|
SandboxID: sandboxID,
|
|
TemplateID: id.UUIDString(templateID),
|
|
KernelPath: m.cfg.KernelPath,
|
|
RootfsPath: dmDev.DevicePath,
|
|
VCPUs: vcpus,
|
|
MemoryMB: memoryMB,
|
|
NetworkNamespace: slot.NamespaceID,
|
|
TapDevice: slot.TapName,
|
|
TapMAC: slot.TapMAC,
|
|
GuestIP: slot.GuestIP,
|
|
GatewayIP: slot.TapIP,
|
|
NetMask: slot.GuestNetMask,
|
|
VMMBin: m.cfg.VMMBin,
|
|
}
|
|
|
|
if _, err := m.vm.Create(ctx, vmCfg); err != nil {
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("create VM: %w", err)
|
|
}
|
|
|
|
// Wait for envd to be ready.
|
|
client := envdclient.New(slot.HostIP.String())
|
|
waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
|
defer waitCancel()
|
|
|
|
if err := client.WaitUntilReady(waitCtx); err != nil {
|
|
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("wait for envd: %w", err)
|
|
}
|
|
|
|
// Fetch envd version (best-effort).
|
|
envdVersion, _ := client.FetchVersion(ctx)
|
|
|
|
now := time.Now()
|
|
sb := &sandboxState{
|
|
Sandbox: models.Sandbox{
|
|
ID: sandboxID,
|
|
Status: models.StatusRunning,
|
|
TemplateTeamID: teamID.Bytes,
|
|
TemplateID: templateID.Bytes,
|
|
VCPUs: vcpus,
|
|
MemoryMB: memoryMB,
|
|
TimeoutSec: timeoutSec,
|
|
SlotIndex: slotIdx,
|
|
HostIP: slot.HostIP,
|
|
RootfsPath: dmDev.DevicePath,
|
|
CreatedAt: now,
|
|
LastActiveAt: now,
|
|
Metadata: m.buildMetadata(envdVersion),
|
|
},
|
|
slot: slot,
|
|
client: client,
|
|
connTracker: &ConnTracker{},
|
|
dmDevice: dmDev,
|
|
baseImagePath: baseRootfs,
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.boxes[sandboxID] = sb
|
|
m.mu.Unlock()
|
|
|
|
m.startSampler(sb)
|
|
m.startCrashWatcher(sb)
|
|
|
|
slog.Info("sandbox created",
|
|
"id", sandboxID,
|
|
"team_id", teamID,
|
|
"template_id", templateID,
|
|
"host_ip", slot.HostIP.String(),
|
|
"dm_device", dmDev.DevicePath,
|
|
)
|
|
|
|
return &sb.Sandbox, nil
|
|
}
|
|
|
|
// Destroy stops and cleans up a sandbox. If the sandbox is running, its VM,
|
|
// network, and rootfs are torn down. Any pause snapshot files are also removed.
|
|
func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
|
m.mu.Lock()
|
|
sb, ok := m.boxes[sandboxID]
|
|
if ok {
|
|
delete(m.boxes, sandboxID)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if ok {
|
|
// Wait for any in-progress Pause to finish before tearing down resources.
|
|
sb.lifecycleMu.Lock()
|
|
defer sb.lifecycleMu.Unlock()
|
|
m.cleanup(ctx, sb)
|
|
}
|
|
|
|
// Always clean up pause snapshot files (may exist if sandbox was paused).
|
|
if err := os.RemoveAll(layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)); err != nil {
|
|
slog.Warn("snapshot cleanup error", "id", sandboxID, "error", err)
|
|
}
|
|
|
|
if m.onDestroy != nil {
|
|
m.onDestroy(sandboxID)
|
|
}
|
|
|
|
slog.Info("sandbox destroyed", "id", sandboxID)
|
|
return nil
|
|
}
|
|
|
|
// cleanup tears down all resources for a sandbox.
|
|
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
|
|
m.stopSampler(sb)
|
|
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
|
slog.Warn("vm destroy error", "id", sb.ID, "error", err)
|
|
}
|
|
if err := network.RemoveNetwork(sb.slot); err != nil {
|
|
slog.Warn("network cleanup error", "id", sb.ID, "error", err)
|
|
}
|
|
m.slots.Release(sb.SlotIndex)
|
|
|
|
// Tear down dm-snapshot and release the base image loop device.
|
|
if sb.dmDevice != nil {
|
|
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
|
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
|
|
}
|
|
os.Remove(sb.dmDevice.CowPath)
|
|
}
|
|
if sb.baseImagePath != "" {
|
|
m.loops.Release(sb.baseImagePath)
|
|
}
|
|
}
|
|
|
|
// cleanupPauseFailure is best-effort teardown when a pause operation fails
|
|
// after the VM has already been destroyed. It releases all resources and removes
|
|
// the sandbox from the in-memory map.
|
|
func (m *Manager) cleanupPauseFailure(sb *sandboxState, sandboxID string, pauseDir string) {
|
|
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
|
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
|
m.slots.Release(sb.SlotIndex)
|
|
if sb.dmDevice != nil {
|
|
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
|
|
os.Remove(sb.dmDevice.CowPath)
|
|
}
|
|
if sb.baseImagePath != "" {
|
|
m.loops.Release(sb.baseImagePath)
|
|
}
|
|
m.mu.Lock()
|
|
delete(m.boxes, sandboxID)
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// Pause takes a snapshot of a running sandbox, then destroys all resources.
|
|
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
|
|
// After this call, the sandbox is no longer running but can be resumed.
|
|
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Serialize lifecycle operations on this sandbox to prevent concurrent
|
|
// Pause/Destroy calls from corrupting VM state.
|
|
sb.lifecycleMu.Lock()
|
|
defer sb.lifecycleMu.Unlock()
|
|
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
// Mark sandbox as pausing to block new exec/file/PTY operations.
|
|
m.mu.Lock()
|
|
sb.Status = models.StatusPausing
|
|
m.mu.Unlock()
|
|
|
|
// restoreRunning reverts state if any pre-freeze step fails.
|
|
restoreRunning := func() {
|
|
_ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
|
|
sb.connTracker.Reset()
|
|
m.mu.Lock()
|
|
sb.Status = models.StatusRunning
|
|
m.mu.Unlock()
|
|
m.startSampler(sb)
|
|
}
|
|
|
|
// Stop the metrics sampler goroutine before tearing down any resources
|
|
// it reads (dm device, VMM PID). Without this, the sampler
|
|
// leaks on every successful pause.
|
|
m.stopSampler(sb)
|
|
|
|
// ── Step 1: Isolate from external traffic ─────────────────────────
|
|
// Drain in-flight proxy connections (grace period for clean shutdown).
|
|
sb.connTracker.Drain(5 * time.Second)
|
|
// Force-close any connections that didn't finish during grace period.
|
|
sb.connTracker.ForceClose()
|
|
slog.Debug("pause: external connections closed", "id", sandboxID)
|
|
|
|
// Close host-side idle connections to envd so FIN packets propagate
|
|
// to the guest kernel before snapshot.
|
|
sb.client.CloseIdleConnections()
|
|
|
|
// ── Step 2: Drop page cache ──────────────────────────────────────
|
|
// Signal envd to quiesce: drops page cache, stops port subsystem,
|
|
// marks connections for post-restore cleanup. Page cache drop can
|
|
// take significant time on large-memory VMs (20GB+).
|
|
func() {
|
|
prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
|
|
defer prepCancel()
|
|
if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
|
|
slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
|
|
} else {
|
|
slog.Debug("pause: envd quiesced", "id", sandboxID)
|
|
}
|
|
}()
|
|
|
|
// ── Step 3: Inflate balloon to reclaim free guest memory ─────────
|
|
// Freed pages become zero in the snapshot's memory-ranges file.
|
|
// CH v52+ writes sparse snapshots natively (SEEK_DATA/SEEK_HOLE).
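// For example, a 4096 MiB sandbox reporting ~1000 MiB used keeps
// max(1500, 512) = 1500 MiB and inflates the balloon by 4096-1500 = 2596 MiB.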
|
|
func() {
|
|
memUsed, err := readEnvdMemUsed(ctx, sb.client)
|
|
if err != nil {
|
|
slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err)
|
|
return
|
|
}
|
|
usedMiB := int(memUsed / (1024 * 1024))
|
|
keepMiB := max(usedMiB*3/2, 512)
|
|
inflateMiB := sb.MemoryMB - keepMiB
|
|
if inflateMiB <= 0 {
|
|
slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB)
|
|
return
|
|
}
|
|
balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer balloonCancel()
|
|
if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil {
|
|
slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err)
|
|
return
|
|
}
|
|
time.Sleep(2 * time.Second)
|
|
slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB)
|
|
}()
|
|
|
|
// ── Step 4: Freeze vCPUs ─────────────────────────────────────────
|
|
pauseStart := time.Now()
|
|
|
|
if err := m.vm.Pause(ctx, sandboxID); err != nil {
|
|
restoreRunning()
|
|
return fmt.Errorf("pause VM: %w", err)
|
|
}
|
|
slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
|
|
|
|
// resumeOnError unpauses the VM so the sandbox stays usable when a
|
|
// post-freeze step fails. If the resume itself fails, the sandbox is
|
|
// frozen and unrecoverable — destroy it to avoid a zombie.
|
|
resumeOnError := func() {
|
|
resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer resumeCancel()
|
|
if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
|
|
slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
|
|
m.cleanup(context.Background(), sb)
|
|
m.mu.Lock()
|
|
delete(m.boxes, sandboxID)
|
|
m.mu.Unlock()
|
|
if m.onDestroy != nil {
|
|
m.onDestroy(sandboxID)
|
|
}
|
|
return
|
|
}
|
|
restoreRunning()
|
|
}
|
|
|
|
// ── Step 5: Take CH snapshot ─────────────────────────────────────
|
|
// Snapshot to a temp dir first. If the sandbox was previously resumed,
|
|
// the old pauseDir still contains memory-ranges that CH's uffd handler
|
|
// is lazily paging from. CH refuses to overwrite existing files (EEXIST),
|
|
// so we write to a fresh dir and swap after the VM is destroyed.
|
|
pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
|
|
tmpPauseDir := pauseDir + ".new"
|
|
if err := os.RemoveAll(tmpPauseDir); err != nil {
|
|
resumeOnError()
|
|
return fmt.Errorf("clean temp snapshot dir: %w", err)
|
|
}
|
|
if err := os.MkdirAll(tmpPauseDir, 0755); err != nil {
|
|
resumeOnError()
|
|
return fmt.Errorf("create snapshot dir: %w", err)
|
|
}
|
|
|
|
snapshotStart := time.Now()
|
|
if err := m.vm.Snapshot(ctx, sandboxID, tmpPauseDir); err != nil {
|
|
slog.Error("pause: snapshot failed", "id", sandboxID, "elapsed", time.Since(snapshotStart), "error", err)
|
|
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(tmpPauseDir))
|
|
resumeOnError()
|
|
return fmt.Errorf("create VM snapshot: %w", err)
|
|
}
|
|
slog.Debug("pause: CH snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart))
|
|
|
|
// ── Step 6: Destroy the VM so CH releases the dm device ──────────
|
|
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
|
slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err)
|
|
}
|
|
|
|
// CH process is dead — uffd handler no longer reads old memory-ranges.
|
|
// Replace old snapshot dir with new one.
|
|
if err := os.RemoveAll(pauseDir); err != nil {
|
|
slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
|
|
}
|
|
if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
|
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
|
return fmt.Errorf("rename snapshot dir: %w", err)
|
|
}
|
|
|
|
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
|
|
if sb.dmDevice != nil {
|
|
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
|
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
|
return fmt.Errorf("remove dm-snapshot: %w", err)
|
|
}
|
|
|
|
snapshotCow := snapshot.CowPath(pauseDir, "")
|
|
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
|
|
os.Remove(sb.dmDevice.CowPath)
|
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
|
return fmt.Errorf("move cow file: %w", err)
|
|
}
|
|
|
|
if err := snapshot.WriteMeta(pauseDir, "", &snapshot.RootfsMeta{
|
|
BaseTemplate: sb.baseImagePath,
|
|
TemplateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}),
|
|
VCPUs: sb.VCPUs,
|
|
MemoryMB: sb.MemoryMB,
|
|
}); err != nil {
|
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
|
return fmt.Errorf("write rootfs meta: %w", err)
|
|
}
|
|
}
|
|
|
|
// ── Step 8: Clean up remaining resources ─────────────────────────
|
|
if err := network.RemoveNetwork(sb.slot); err != nil {
|
|
slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err)
|
|
}
|
|
m.slots.Release(sb.SlotIndex)
|
|
if sb.baseImagePath != "" {
|
|
m.loops.Release(sb.baseImagePath)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
delete(m.boxes, sandboxID)
|
|
m.mu.Unlock()
|
|
|
|
slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart))
|
|
return nil
|
|
}
|
|
|
|
// Resume restores a paused sandbox from its CH snapshot.
|
|
// CH handles memory restore internally (with on-demand paging).
|
|
// The sandbox gets a new network slot.
|
|
// Optional defaultUser and defaultEnv are applied via PostInit with
|
|
// sandbox_id and template_id so envd picks up the new sandbox's metadata.
|
|
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) {
|
|
pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
|
|
if _, err := os.Stat(pauseDir); err != nil {
|
|
return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
|
|
}
|
|
|
|
// Read rootfs metadata to find the base template image.
|
|
meta, err := snapshot.ReadMeta(pauseDir, "")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("read rootfs meta: %w", err)
|
|
}
|
|
|
|
// Acquire the base image loop device and restore dm-snapshot from saved CoW.
|
|
baseImagePath := meta.BaseTemplate
|
|
originLoop, err := m.loops.Acquire(baseImagePath)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("acquire loop device: %w", err)
|
|
}
|
|
|
|
originSize, err := devicemapper.OriginSizeBytes(originLoop)
|
|
if err != nil {
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("get origin size: %w", err)
|
|
}
|
|
|
|
// Move CoW file from snapshot dir to sandboxes dir for the running sandbox.
|
|
savedCow := snapshot.CowPath(pauseDir, "")
|
|
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
|
if err := os.Rename(savedCow, cowPath); err != nil {
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("move cow file: %w", err)
|
|
}
|
|
|
|
rollbackCow := func() {
|
|
if err := os.Rename(cowPath, savedCow); err != nil {
|
|
slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
|
|
}
|
|
}
|
|
|
|
// Restore dm-snapshot from existing persistent CoW file.
|
|
dmName := "wrenn-" + sandboxID
|
|
dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize)
|
|
if err != nil {
|
|
m.loops.Release(baseImagePath)
|
|
rollbackCow()
|
|
return nil, fmt.Errorf("restore dm-snapshot: %w", err)
|
|
}
|
|
|
|
// Allocate network slot.
|
|
slotIdx, err := m.slots.Allocate()
|
|
if err != nil {
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
rollbackCow()
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("allocate network slot: %w", err)
|
|
}
|
|
slot := network.NewSlot(slotIdx)
|
|
|
|
if err := network.CreateNetwork(slot); err != nil {
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
rollbackCow()
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("create network: %w", err)
|
|
}
|
|
|
|
// Restore VM from CH snapshot.
|
|
vmCfg := vm.VMConfig{
|
|
SandboxID: sandboxID,
|
|
TemplateID: meta.TemplateID,
|
|
KernelPath: m.resolveKernelPath(kernelVersion),
|
|
RootfsPath: dmDev.DevicePath,
|
|
NetworkNamespace: slot.NamespaceID,
|
|
TapDevice: slot.TapName,
|
|
TapMAC: slot.TapMAC,
|
|
GuestIP: slot.GuestIP,
|
|
GatewayIP: slot.TapIP,
|
|
NetMask: slot.GuestNetMask,
|
|
VMMBin: m.cfg.VMMBin,
|
|
}
|
|
|
|
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, pauseDir); err != nil {
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
rollbackCow()
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
|
|
}
|
|
|
|
// Wait for envd to be ready.
|
|
client := envdclient.New(slot.HostIP.String())
|
|
waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
|
|
|
if err := client.WaitUntilReady(waitCtx); err != nil {
|
|
waitCancel()
|
|
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
rollbackCow()
|
|
m.loops.Release(baseImagePath)
|
|
return nil, fmt.Errorf("wait for envd: %w", err)
|
|
}
|
|
waitCancel()
|
|
|
|
// PostInit with sandbox_id and template_id so envd sets metadata env vars.
|
|
// Fire-and-forget: post-init is non-critical (metadata/env vars), and
|
|
// blocking here widens the window for the CP monitor to race against us.
|
|
go func() {
|
|
initCtx, initCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer initCancel()
|
|
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, meta.TemplateID); err != nil {
|
|
slog.Warn("post-init failed after resume, metadata may be stale", "sandbox", sandboxID, "error", err)
|
|
}
|
|
}()
|
|
|
|
// Deflate balloon — the snapshot was taken with an inflated balloon.
|
|
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
|
|
slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
|
|
}
|
|
|
|
// Wait for balloon deflation to settle. With OnDemand UFFD restore,
|
|
// the guest is under severe memory pressure while the balloon is still
|
|
// inflated. If the sampler or memory reclaimer runs during this window,
|
|
// the resulting page fault storm can make the guest kernel unresponsive.
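// With the 80% threshold below, a 4096 MiB sandbox is considered settled once
// the guest reports usage below roughly 3277 MiB (80% of 4096).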
|
|
if meta.MemoryMB > 0 {
|
|
threshold := int64(float64(meta.MemoryMB) * 0.80 * 1024 * 1024)
|
|
settleCtx, settleCancel := context.WithTimeout(ctx, 10*time.Second)
|
|
ticker := time.NewTicker(500 * time.Millisecond)
|
|
settled := false
|
|
for !settled {
|
|
select {
|
|
case <-settleCtx.Done():
|
|
slog.Warn("resume: balloon deflation did not settle in time", "id", sandboxID)
|
|
settled = true
|
|
case <-ticker.C:
|
|
used, err := readEnvdMemUsed(settleCtx, client)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if used < threshold {
|
|
slog.Info("resume: balloon deflation settled", "id", sandboxID, "used_mib", used/(1024*1024))
|
|
settled = true
|
|
}
|
|
}
|
|
}
|
|
ticker.Stop()
|
|
settleCancel()
|
|
}
|
|
|
|
// Fetch envd version (best-effort).
|
|
envdVersion, _ := client.FetchVersion(ctx)
|
|
|
|
vcpus := meta.VCPUs
|
|
if vcpus <= 0 {
|
|
vcpus = 1
|
|
}
|
|
memoryMB := meta.MemoryMB
|
|
if memoryMB <= 0 {
|
|
memoryMB = 512
|
|
}
|
|
|
|
now := time.Now()
|
|
sb := &sandboxState{
|
|
Sandbox: models.Sandbox{
|
|
ID: sandboxID,
|
|
Status: models.StatusRunning,
|
|
VCPUs: vcpus,
|
|
MemoryMB: memoryMB,
|
|
TimeoutSec: timeoutSec,
|
|
SlotIndex: slotIdx,
|
|
HostIP: slot.HostIP,
|
|
RootfsPath: dmDev.DevicePath,
|
|
CreatedAt: now,
|
|
LastActiveAt: now,
|
|
Metadata: m.buildMetadata(envdVersion),
|
|
},
|
|
slot: slot,
|
|
client: client,
|
|
connTracker: &ConnTracker{},
|
|
dmDevice: dmDev,
|
|
baseImagePath: baseImagePath,
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.boxes[sandboxID] = sb
|
|
m.mu.Unlock()
|
|
|
|
m.startSampler(sb)
|
|
m.startCrashWatcher(sb)
|
|
|
|
slog.Info("sandbox resumed from snapshot",
|
|
"id", sandboxID,
|
|
"host_ip", slot.HostIP.String(),
|
|
"dm_device", dmDev.DevicePath,
|
|
"vcpus", vcpus,
|
|
)
|
|
|
|
return &sb.Sandbox, nil
|
|
}
|
|
|
|
// CreateSnapshot creates a reusable template from a sandbox. Works on both
|
|
// running and paused sandboxes. If the sandbox is running, it is paused first.
|
|
// The sandbox remains paused after this call (it can still be resumed).
|
|
//
|
|
// The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4
|
|
// so the template has no dependency on the original base image. Memory state
|
|
// and VM snapshot files are copied as-is.
|
|
func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) {
|
|
// If the sandbox is running, pause it first.
|
|
if _, err := m.get(sandboxID); err == nil {
|
|
if err := m.Pause(ctx, sandboxID); err != nil {
|
|
return 0, fmt.Errorf("pause sandbox: %w", err)
|
|
}
|
|
}
|
|
|
|
// At this point, pause snapshot files must exist.
|
|
pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
|
|
if _, err := os.Stat(pauseDir); err != nil {
|
|
return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
|
|
}
|
|
|
|
// Create template directory.
|
|
dstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
|
|
if err := os.MkdirAll(dstDir, 0755); err != nil {
|
|
return 0, fmt.Errorf("create template dir: %w", err)
|
|
}
|
|
|
|
// Copy CH snapshot files (config.json, memory-ranges, state.json).
|
|
for _, fname := range []string{snapshot.CHConfigFile, snapshot.CHMemRangesFile, snapshot.CHStateFile} {
|
|
src := filepath.Join(pauseDir, fname)
|
|
dst := filepath.Join(dstDir, fname)
|
|
if _, err := os.Stat(src); err != nil {
|
|
continue // some files may not exist
|
|
}
|
|
if err := copyFile(src, dst); err != nil {
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("copy %s: %w", fname, err)
|
|
}
|
|
}
|
|
|
|
// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
|
|
meta, err := snapshot.ReadMeta(pauseDir, "")
|
|
if err != nil {
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("read rootfs meta: %w", err)
|
|
}
|
|
|
|
originLoop, err := m.loops.Acquire(meta.BaseTemplate)
|
|
if err != nil {
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("acquire loop device for flatten: %w", err)
|
|
}
|
|
|
|
originSize, err := devicemapper.OriginSizeBytes(originLoop)
|
|
if err != nil {
|
|
m.loops.Release(meta.BaseTemplate)
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("get origin size: %w", err)
|
|
}
|
|
|
|
// Temporarily restore the dm-snapshot to read the merged view.
|
|
cowPath := snapshot.CowPath(pauseDir, "")
|
|
tmpDmName := "wrenn-flatten-" + sandboxID
|
|
tmpDev, err := devicemapper.RestoreSnapshot(ctx, tmpDmName, originLoop, cowPath, originSize)
|
|
if err != nil {
|
|
m.loops.Release(meta.BaseTemplate)
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err)
|
|
}
|
|
|
|
// Flatten to new standalone rootfs.
|
|
flattenedPath := filepath.Join(dstDir, snapshot.RootfsFileName)
|
|
flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath)
|
|
|
|
// Always clean up the temporary dm device.
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), tmpDev))
|
|
m.loops.Release(meta.BaseTemplate)
|
|
|
|
if flattenErr != nil {
|
|
warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir))
|
|
return 0, fmt.Errorf("flatten rootfs: %w", flattenErr)
|
|
}
|
|
|
|
sizeBytes, err := snapshot.DirSize(dstDir, "")
|
|
if err != nil {
|
|
slog.Warn("failed to calculate snapshot size", "error", err)
|
|
}
|
|
|
|
slog.Info("template snapshot created (rootfs flattened)",
|
|
"sandbox", sandboxID,
|
|
"team_id", teamID,
|
|
"template_id", templateID,
|
|
"size_bytes", sizeBytes,
|
|
)
|
|
return sizeBytes, nil
|
|
}
|
|
|
|
// FlattenRootfs stops a running sandbox, flattens its device-mapper CoW
|
|
// rootfs into a standalone rootfs.ext4, and cleans up all resources.
|
|
// The result is an image-only template (no VM memory/CPU state) stored in
|
|
// the template directory (TemplateDir/{teamID}/{templateID}/rootfs.ext4).
|
|
func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID) (int64, error) {
|
|
m.mu.Lock()
|
|
sb, ok := m.boxes[sandboxID]
|
|
if ok {
|
|
delete(m.boxes, sandboxID)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if !ok {
|
|
return 0, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
|
}
|
|
|
|
// Flush guest page cache to disk before stopping the VM. Without this,
|
|
// files written by the build (e.g. pip-installed packages) may exist in the
|
|
// guest's page cache but not yet on the dm block device — flatten would then
|
|
// capture 0-byte files.
|
|
func() {
|
|
syncCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
if _, err := sb.client.Exec(syncCtx, "/bin/sync"); err != nil {
|
|
slog.Warn("flatten: guest sync failed (non-fatal)", "id", sb.ID, "error", err)
|
|
}
|
|
}()
|
|
|
|
// Stop the VM but keep the dm device alive for flattening.
|
|
m.stopSampler(sb)
|
|
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
|
slog.Warn("vm destroy error during flatten", "id", sb.ID, "error", err)
|
|
}
|
|
|
|
// Release network resources — not needed after VM is stopped.
|
|
if err := network.RemoveNetwork(sb.slot); err != nil {
|
|
slog.Warn("network cleanup error during flatten", "id", sb.ID, "error", err)
|
|
}
|
|
m.slots.Release(sb.SlotIndex)
|
|
|
|
// Create template directory and flatten the dm-snapshot.
|
|
flattenDstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
|
|
if err := os.MkdirAll(flattenDstDir, 0755); err != nil {
|
|
m.cleanupDM(sb)
|
|
return 0, fmt.Errorf("create template dir: %w", err)
|
|
}
|
|
|
|
outputPath := filepath.Join(flattenDstDir, snapshot.RootfsFileName)
|
|
if sb.dmDevice == nil {
|
|
m.cleanupDM(sb)
|
|
warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir))
|
|
return 0, fmt.Errorf("sandbox %s has no dm device", sandboxID)
|
|
}
|
|
|
|
if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, outputPath); err != nil {
|
|
m.cleanupDM(sb)
|
|
warnErr("template dir cleanup error", flattenDstDir, os.RemoveAll(flattenDstDir))
|
|
return 0, fmt.Errorf("flatten rootfs: %w", err)
|
|
}
|
|
|
|
// Clean up dm device and loop device now that flatten is complete.
|
|
m.cleanupDM(sb)
|
|
|
|
// Shrink the flattened image to its minimum size, then re-expand to the
|
|
// configured default rootfs size so sandboxes see the full disk from boot.
|
|
if out, err := exec.Command("e2fsck", "-fy", outputPath).CombinedOutput(); err != nil {
|
|
if exitErr, ok := err.(*exec.ExitError); ok && exitErr.ExitCode() > 1 {
|
|
slog.Warn("e2fsck before shrink failed (non-fatal)", "output", string(out), "error", err)
|
|
}
|
|
}
|
|
if out, err := exec.Command("resize2fs", "-M", outputPath).CombinedOutput(); err != nil {
|
|
slog.Warn("resize2fs -M failed (non-fatal)", "output", string(out), "error", err)
|
|
}
|
|
|
|
// Re-expand to default rootfs size.
|
|
targetMB := m.cfg.DefaultRootfsSizeMB
|
|
if targetMB <= 0 {
|
|
targetMB = DefaultDiskSizeMB
|
|
}
|
|
if err := expandImage(outputPath, int64(targetMB)*1024*1024, targetMB); err != nil {
|
|
slog.Warn("failed to expand template to default size (non-fatal)", "error", err)
|
|
}
|
|
|
|
sizeBytes, err := snapshot.DirSize(flattenDstDir, "")
|
|
if err != nil {
|
|
slog.Warn("failed to calculate template size", "error", err)
|
|
}
|
|
|
|
slog.Info("rootfs flattened to image-only template",
|
|
"sandbox", sandboxID,
|
|
"team_id", teamID,
|
|
"template_id", templateID,
|
|
"size_bytes", sizeBytes,
|
|
)
|
|
return sizeBytes, nil
|
|
}
|
|
|
|
// cleanupDM tears down the dm-snapshot device and releases the base image loop device.
|
|
func (m *Manager) cleanupDM(sb *sandboxState) {
|
|
if sb.dmDevice != nil {
|
|
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
|
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
|
|
}
|
|
os.Remove(sb.dmDevice.CowPath)
|
|
}
|
|
if sb.baseImagePath != "" {
|
|
m.loops.Release(sb.baseImagePath)
|
|
}
|
|
}
|
|
|
|
// DeleteSnapshot removes a snapshot template from disk.
|
|
func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error {
|
|
return os.RemoveAll(layout.TemplateDir(m.cfg.WrennDir, teamID, templateID))
|
|
}
|
|
|
|
// createFromSnapshot creates a new sandbox by restoring from a snapshot template.
|
|
// CH handles memory restore internally (with on-demand paging).
|
|
// The template's rootfs.ext4 is a flattened standalone image — we create a
|
|
// dm-snapshot on top of it just like a normal Create.
|
|
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, _, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
|
|
tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
|
|
|
|
// Set up dm-snapshot on the template's flattened rootfs.
|
|
baseRootfs := filepath.Join(tmplDir, snapshot.RootfsFileName)
|
|
originLoop, err := m.loops.Acquire(baseRootfs)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("acquire loop device: %w", err)
|
|
}
|
|
|
|
originSize, err := devicemapper.OriginSizeBytes(originLoop)
|
|
if err != nil {
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("get origin size: %w", err)
|
|
}
|
|
|
|
dmName := "wrenn-" + sandboxID
|
|
cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
|
|
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
|
|
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
|
|
if err != nil {
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("create dm-snapshot: %w", err)
|
|
}
|
|
|
|
// Allocate network.
|
|
slotIdx, err := m.slots.Allocate()
|
|
if err != nil {
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("allocate network slot: %w", err)
|
|
}
|
|
slot := network.NewSlot(slotIdx)
|
|
|
|
if err := network.CreateNetwork(slot); err != nil {
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("create network: %w", err)
|
|
}
|
|
|
|
// Restore VM from CH snapshot.
|
|
vmCfg := vm.VMConfig{
|
|
SandboxID: sandboxID,
|
|
TemplateID: id.UUIDString(templateID),
|
|
KernelPath: m.cfg.KernelPath,
|
|
RootfsPath: dmDev.DevicePath,
|
|
VCPUs: vcpus,
|
|
NetworkNamespace: slot.NamespaceID,
|
|
TapDevice: slot.TapName,
|
|
TapMAC: slot.TapMAC,
|
|
GuestIP: slot.GuestIP,
|
|
GatewayIP: slot.TapIP,
|
|
NetMask: slot.GuestNetMask,
|
|
VMMBin: m.cfg.VMMBin,
|
|
}
|
|
|
|
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, tmplDir); err != nil {
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
|
|
}
|
|
|
|
// Wait for envd.
|
|
client := envdclient.New(slot.HostIP.String())
|
|
waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
|
|
|
if err := client.WaitUntilReady(waitCtx); err != nil {
|
|
waitCancel()
|
|
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
|
|
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
|
|
m.slots.Release(slotIdx)
|
|
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
|
|
os.Remove(cowPath)
|
|
m.loops.Release(baseRootfs)
|
|
return nil, fmt.Errorf("wait for envd: %w", err)
|
|
}
|
|
waitCancel()
|
|
|
|
// PostInit with sandbox_id and template_id so envd sets metadata.
|
|
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
|
defer initCancel()
|
|
|
|
if err := client.PostInitWithDefaults(initCtx, "", nil, sandboxID, id.UUIDString(templateID)); err != nil {
|
|
slog.Warn("post-init failed after template restore, metadata may be stale", "sandbox", sandboxID, "error", err)
|
|
}
|
|
|
|
// Deflate balloon — template snapshot was taken with an inflated balloon.
|
|
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
|
|
slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
|
|
}
|
|
|
|
// Fetch envd version (best-effort).
|
|
envdVersion, _ := client.FetchVersion(ctx)
|
|
|
|
now := time.Now()
|
|
sb := &sandboxState{
|
|
Sandbox: models.Sandbox{
|
|
ID: sandboxID,
|
|
Status: models.StatusRunning,
|
|
TemplateTeamID: teamID.Bytes,
|
|
TemplateID: templateID.Bytes,
|
|
VCPUs: vcpus,
|
|
TimeoutSec: timeoutSec,
|
|
SlotIndex: slotIdx,
|
|
HostIP: slot.HostIP,
|
|
RootfsPath: dmDev.DevicePath,
|
|
CreatedAt: now,
|
|
LastActiveAt: now,
|
|
Metadata: m.buildMetadata(envdVersion),
|
|
},
|
|
slot: slot,
|
|
client: client,
|
|
connTracker: &ConnTracker{},
|
|
dmDevice: dmDev,
|
|
baseImagePath: baseRootfs,
|
|
}
|
|
|
|
m.mu.Lock()
|
|
m.boxes[sandboxID] = sb
|
|
m.mu.Unlock()
|
|
|
|
m.startSampler(sb)
|
|
m.startCrashWatcher(sb)
|
|
|
|
slog.Info("sandbox created from snapshot",
|
|
"id", sandboxID,
|
|
"team_id", teamID,
|
|
"template_id", templateID,
|
|
"host_ip", slot.HostIP.String(),
|
|
"dm_device", dmDev.DevicePath,
|
|
)
|
|
|
|
return &sb.Sandbox, nil
|
|
}
|
|
|
|
// Exec runs a command inside a sandbox.
|
|
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args ...string) (*envdclient.ExecResult, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.Exec(ctx, cmd, args...)
|
|
}
|
|
|
|
// ExecStream runs a command inside a sandbox and returns a channel of streaming events.
|
|
func (m *Manager) ExecStream(ctx context.Context, sandboxID string, cmd string, args ...string) (<-chan envdclient.ExecStreamEvent, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.ExecStream(ctx, cmd, args...)
|
|
}
|
|
|
|
// List returns all sandboxes.
|
|
func (m *Manager) List() []models.Sandbox {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
result := make([]models.Sandbox, 0, len(m.boxes))
|
|
for _, sb := range m.boxes {
|
|
result = append(result, sb.Sandbox)
|
|
}
|
|
return result
|
|
}
|
|
|
|
// Get returns a sandbox by ID.
|
|
func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &sb.Sandbox, nil
|
|
}
|
|
|
|
// GetClient returns the envd client for a sandbox.
|
|
func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
return sb.client, nil
|
|
}
|
|
|
|
// SetDefaults calls envd's PostInit to configure the default user and
|
|
// environment variables for a running sandbox. This is called by the host
|
|
// agent after sandbox creation or resume when the template specifies defaults.
|
|
func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string, defaultEnv map[string]string) error {
|
|
if defaultUser == "" && len(defaultEnv) == 0 {
|
|
return nil
|
|
}
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
return sb.client.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
|
|
}
|
|
|
|
// PtyAttach starts a new PTY process or reconnects to an existing one.
|
|
// If cmd is non-empty, starts a new process. If empty, reconnects using tag.
|
|
func (m *Manager) PtyAttach(ctx context.Context, sandboxID, tag, cmd string, args []string, cols, rows uint32, envs map[string]string, cwd string) (<-chan envdclient.PtyEvent, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
if cmd != "" {
|
|
return sb.client.PtyStart(ctx, tag, cmd, args, cols, rows, envs, cwd)
|
|
}
|
|
return sb.client.PtyConnect(ctx, tag)
|
|
}
|
|
|
|
// PtySendInput sends raw bytes to a PTY process in a sandbox.
|
|
func (m *Manager) PtySendInput(ctx context.Context, sandboxID, tag string, data []byte) error {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.PtySendInput(ctx, tag, data)
|
|
}
|
|
|
|
// PtyResize updates the terminal dimensions for a PTY process in a sandbox.
|
|
func (m *Manager) PtyResize(ctx context.Context, sandboxID, tag string, cols, rows uint32) error {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
return sb.client.PtyResize(ctx, tag, cols, rows)
|
|
}
|
|
|
|
// PtyKill sends SIGKILL to a PTY process in a sandbox.
|
|
func (m *Manager) PtyKill(ctx context.Context, sandboxID, tag string) error {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
return sb.client.PtyKill(ctx, tag)
|
|
}
|
|
|
|
// StartBackground starts a background process inside a sandbox.
|
|
func (m *Manager) StartBackground(ctx context.Context, sandboxID, tag, cmd string, args []string, envs map[string]string, cwd string) (uint32, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return 0, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.StartBackground(ctx, tag, cmd, args, envs, cwd)
|
|
}
|
|
|
|
// ConnectProcess re-attaches to a running process inside a sandbox.
|
|
func (m *Manager) ConnectProcess(ctx context.Context, sandboxID string, pid uint32, tag string) (<-chan envdclient.ExecStreamEvent, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.ConnectProcess(ctx, pid, tag)
|
|
}
|
|
|
|
// ListProcesses returns all running processes inside a sandbox.
|
|
func (m *Manager) ListProcesses(ctx context.Context, sandboxID string) ([]envdclient.ProcessInfo, error) {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.ListProcesses(ctx)
|
|
}
|
|
|
|
// KillProcess sends a signal to a process inside a sandbox.
|
|
func (m *Manager) KillProcess(ctx context.Context, sandboxID string, pid uint32, tag string, signal envdpb.Signal) error {
|
|
sb, err := m.get(sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
|
|
m.mu.Lock()
|
|
sb.LastActiveAt = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
return sb.client.KillProcess(ctx, pid, tag, signal)
|
|
}
|
|
|
|
// AcquireProxyConn atomically looks up a sandbox by ID and registers an
|
|
// in-flight proxy connection. Returns the sandbox's host-reachable IP, the
|
|
// connection tracker, and true on success. The caller must call
|
|
// tracker.Release() when the request completes. Returns zero values and
|
|
// false if the sandbox is not found, not running, or is draining for a pause.
|
|
func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) {
|
|
m.mu.RLock()
|
|
sb, ok := m.boxes[sandboxID]
|
|
m.mu.RUnlock()
|
|
|
|
if !ok || sb.Status != models.StatusRunning {
|
|
return nil, nil, false
|
|
}
|
|
if !sb.connTracker.Acquire() {
|
|
return nil, nil, false
|
|
}
|
|
return sb.HostIP, sb.connTracker, true
|
|
}
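// Expected caller pattern, as a sketch (the actual ProxyHandler wiring is not
// shown here):
//
//	ip, tracker, ok := m.AcquireProxyConn(sandboxID)
//	if !ok {
//		return // not found, not running, or draining for pause
//	}
//	defer tracker.Release()
//	// ... forward the request to ip ...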
|
|
|
|
// Ping resets the inactivity timer for a running sandbox.
|
|
func (m *Manager) Ping(sandboxID string) error {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
sb, ok := m.boxes[sandboxID]
|
|
if !ok {
|
|
return fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
|
}
|
|
sb.LastActiveAt = time.Now()
|
|
return nil
|
|
}
|
|
|
|
// DrainAutoPausedIDs returns and clears the list of sandbox IDs that were
|
|
// automatically paused by the TTL reaper since the last call.
|
|
func (m *Manager) DrainAutoPausedIDs() []string {
|
|
m.autoPausedMu.Lock()
|
|
defer m.autoPausedMu.Unlock()
|
|
|
|
ids := m.autoPausedIDs
|
|
m.autoPausedIDs = nil
|
|
return ids
|
|
}
|
|
|
|
func (m *Manager) get(sandboxID string) (*sandboxState, error) {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
|
|
sb, ok := m.boxes[sandboxID]
|
|
if !ok {
|
|
return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
|
}
|
|
return sb, nil
|
|
}
|
|
|
|
// StartTTLReaper starts a background goroutine that auto-pauses sandboxes
// that have exceeded their TTL (timeout_sec of inactivity). If the pause
// fails, the sandbox is destroyed instead.
|
|
func (m *Manager) StartTTLReaper(ctx context.Context) {
|
|
go func() {
|
|
ticker := time.NewTicker(2 * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-m.stopCh:
|
|
return
|
|
case <-ticker.C:
|
|
m.reapExpired(ctx)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (m *Manager) reapExpired(_ context.Context) {
|
|
m.mu.RLock()
|
|
var expired []string
|
|
now := time.Now()
|
|
for id, sb := range m.boxes {
|
|
if sb.TimeoutSec <= 0 {
|
|
continue
|
|
}
|
|
if sb.Status != models.StatusRunning {
|
|
continue
|
|
}
|
|
if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
|
|
expired = append(expired, id)
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
for _, id := range expired {
|
|
slog.Info("TTL expired, auto-pausing sandbox", "id", id)
|
|
// Use a detached context so that an app shutdown does not cancel
|
|
// a pause mid-flight, which would leave the VM frozen without a
|
|
// valid snapshot.
|
|
pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
err := m.Pause(pauseCtx, id)
|
|
cancel()
|
|
if err != nil {
|
|
slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err)
|
|
if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil {
|
|
slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr)
|
|
} else if m.eventSender != nil {
|
|
m.eventSender.SendAsync(LifecycleEvent{
|
|
Event: "sandbox.stopped",
|
|
SandboxID: id,
|
|
})
|
|
}
|
|
continue
|
|
}
|
|
m.autoPausedMu.Lock()
|
|
m.autoPausedIDs = append(m.autoPausedIDs, id)
|
|
m.autoPausedMu.Unlock()
|
|
|
|
if m.eventSender != nil {
|
|
m.eventSender.SendAsync(LifecycleEvent{
|
|
Event: "sandbox.auto_paused",
|
|
SandboxID: id,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper.
|
|
func (m *Manager) Shutdown(ctx context.Context) {
|
|
close(m.stopCh)
|
|
|
|
m.mu.Lock()
|
|
ids := make([]string, 0, len(m.boxes))
|
|
for id := range m.boxes {
|
|
ids = append(ids, id)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
for _, sbID := range ids {
|
|
slog.Info("shutdown: destroying sandbox", "id", sbID)
|
|
if err := m.Destroy(ctx, sbID); err != nil {
|
|
slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
|
|
}
|
|
}
|
|
|
|
m.loops.ReleaseAll()
|
|
}
|
|
|
|
// PauseAll pauses every running sandbox managed by this host agent.
|
|
// Called when the host loses connectivity to the control plane to avoid
|
|
// leaving running VMs unmanaged. It is best-effort: failures for individual
|
|
// sandboxes are logged but do not stop the rest.
|
|
func (m *Manager) PauseAll(ctx context.Context) {
|
|
m.mu.RLock()
|
|
ids := make([]string, 0, len(m.boxes))
|
|
for id, sb := range m.boxes {
|
|
if sb.Status == models.StatusRunning {
|
|
ids = append(ids, id)
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
|
|
slog.Info("pausing all running sandboxes due to CP connection loss", "count", len(ids))
|
|
for _, sbID := range ids {
|
|
if err := m.Pause(ctx, sbID); err != nil {
|
|
slog.Warn("PauseAll: failed to pause sandbox", "id", sbID, "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// warnErr logs a warning if err is non-nil. Used for best-effort cleanup
|
|
// in error paths where the primary error has already been captured.
|
|
func warnErr(msg string, id string, err error) {
|
|
if err != nil {
|
|
slog.Warn(msg, "id", id, "error", err)
|
|
}
|
|
}
|
|
|
|
// startCrashWatcher monitors the VM process for unexpected exits.
|
|
// If the process exits while the sandbox is still in m.boxes (i.e. not a
|
|
// deliberate Destroy), the sandbox is cleaned up and a sandbox.error event
|
|
// is pushed to the control plane.
|
|
func (m *Manager) startCrashWatcher(sb *sandboxState) {
|
|
v, ok := m.vm.Get(sb.ID)
|
|
if !ok {
|
|
return
|
|
}
|
|
go func() {
|
|
select {
|
|
case <-v.Exited():
|
|
case <-m.stopCh:
|
|
return
|
|
}
|
|
|
|
// Check if this was a deliberate Destroy/Pause (sandbox already removed
|
|
// from boxes, or Pause owns the cleanup).
|
|
m.mu.Lock()
|
|
_, stillAlive := m.boxes[sb.ID]
|
|
if stillAlive && sb.Status == models.StatusPausing {
|
|
stillAlive = false
|
|
}
|
|
if stillAlive {
|
|
delete(m.boxes, sb.ID)
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
if !stillAlive {
|
|
return
|
|
}
|
|
|
|
slog.Error("VM process crashed, cleaning up", "id", sb.ID)
|
|
|
|
sb.lifecycleMu.Lock()
|
|
m.cleanupAfterCrash(sb)
|
|
sb.lifecycleMu.Unlock()
|
|
|
|
if m.onDestroy != nil {
|
|
m.onDestroy(sb.ID)
|
|
}
|
|
|
|
if m.eventSender != nil {
|
|
m.eventSender.SendAsync(LifecycleEvent{
|
|
Event: "sandbox.error",
|
|
SandboxID: sb.ID,
|
|
})
|
|
}
|
|
}()
|
|
}
|
|
|
|
// cleanupAfterCrash tears down sandbox resources after a VM crash.
|
|
// The VM process is already dead so we skip vm.Destroy and just clean up
|
|
// network, device-mapper, and loop devices.
|
|
func (m *Manager) cleanupAfterCrash(sb *sandboxState) {
|
|
m.stopSampler(sb)
|
|
|
|
// Remove the VM from the vm.Manager's map (process is already dead).
|
|
_ = m.vm.Destroy(context.Background(), sb.ID)
|
|
|
|
if err := network.RemoveNetwork(sb.slot); err != nil {
|
|
slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err)
|
|
}
|
|
m.slots.Release(sb.SlotIndex)
|
|
|
|
if sb.dmDevice != nil {
|
|
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
|
slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err)
|
|
}
|
|
os.Remove(sb.dmDevice.CowPath)
|
|
}
|
|
if sb.baseImagePath != "" {
|
|
m.loops.Release(sb.baseImagePath)
|
|
}
|
|
}
|
|
|
|
// startSampler resolves the VMM PID and starts a background goroutine
|
|
// that samples CPU/mem/disk at 1s intervals into the ring buffer.
|
|
// Must be called after the sandbox is registered in m.boxes.
|
|
func (m *Manager) startSampler(sb *sandboxState) {
|
|
v, ok := m.vm.Get(sb.ID)
|
|
if !ok {
|
|
slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID)
|
|
return
|
|
}
|
|
|
|
// v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script"
|
|
// invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor)
|
|
// occupies the same PID. v.PID() IS the VMM PID.
|
|
vmmPID := v.PID()
|
|
|
|
sb.vmmPID = vmmPID
|
|
sb.ring = newMetricsRing()
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
sb.samplerCancel = cancel
|
|
sb.samplerDone = make(chan struct{})
|
|
|
|
// Read initial CPU counters for delta calculation.
|
|
// Passed to goroutine as local state — no shared mutation.
|
|
initialCPU, err := readCPUStat(vmmPID)
|
|
if err != nil {
|
|
slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err)
|
|
}
|
|
|
|
go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU)
|
|
}
|
|
|
|
// samplerLoop samples metrics at 1s intervals.
|
|
// lastCPU is goroutine-local to avoid shared-state races.
|
|
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) {
|
|
defer close(sb.samplerDone)
|
|
|
|
ticker := time.NewTicker(1 * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux
|
|
lastTime := time.Now()
|
|
cpuInitialized := lastCPU != (cpuStat{})
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case now := <-ticker.C:
|
|
elapsed := now.Sub(lastTime).Seconds()
|
|
lastTime = now
|
|
|
|
// CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100
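// e.g. 150 delta jiffies over 1.0s on 2 vCPUs: 150/(1.0*100*2)*100 = 75%.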
|
|
var cpuPct float64
|
|
cur, err := readCPUStat(vmmPID)
|
|
if err == nil {
|
|
if cpuInitialized && elapsed > 0 && vcpus > 0 {
|
|
deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime))
|
|
cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0
|
|
if cpuPct > 100.0 {
|
|
cpuPct = 100.0
|
|
}
|
|
if cpuPct < 0 {
|
|
cpuPct = 0
|
|
}
|
|
}
|
|
lastCPU = cur
|
|
cpuInitialized = true
|
|
}
|
|
|
|
// Memory: guest-reported used memory from envd /metrics.
|
|
// VmRSS of the VMM process includes guest page cache and never
|
|
// decreases, so we use the guest's own view which reports
|
|
// total - available (actual process memory).
|
|
memBytes, _ := readEnvdMemUsed(ctx, sb.client)
|
|
|
|
// Disk: allocated bytes of the CoW sparse file.
|
|
var diskBytes int64
|
|
if sb.dmDevice != nil {
|
|
diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath)
|
|
}
|
|
|
|
sb.ring.Push(MetricPoint{
|
|
Timestamp: now,
|
|
CPUPct: cpuPct,
|
|
MemBytes: memBytes,
|
|
DiskBytes: diskBytes,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// stopSampler stops the metrics sampling goroutine and waits for it to exit.
|
|
func (m *Manager) stopSampler(sb *sandboxState) {
|
|
if sb.samplerCancel != nil {
|
|
sb.samplerCancel()
|
|
<-sb.samplerDone
|
|
sb.samplerCancel = nil
|
|
}
|
|
}
|
|
|
|
// GetMetrics returns the ring buffer data for the given range tier.
|
|
// Valid ranges: "5m", "10m", "1h", "2h", "6h", "12h", "24h".
|
|
func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) {
|
|
m.mu.RLock()
|
|
sb, ok := m.boxes[sandboxID]
|
|
m.mu.RUnlock()
|
|
if !ok {
|
|
return nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
|
}
|
|
if sb.ring == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
// Map the requested range to the appropriate ring tier and time cutoff.
|
|
var points []MetricPoint
|
|
var cutoff time.Duration
|
|
switch rangeTier {
|
|
case "5m":
|
|
points = sb.ring.Get10m()
|
|
cutoff = 5 * time.Minute
|
|
case "10m":
|
|
points = sb.ring.Get10m()
|
|
cutoff = 10 * time.Minute
|
|
case "1h":
|
|
points = sb.ring.Get2h()
|
|
cutoff = 1 * time.Hour
|
|
case "2h":
|
|
points = sb.ring.Get2h()
|
|
cutoff = 2 * time.Hour
|
|
case "6h":
|
|
points = sb.ring.Get24h()
|
|
cutoff = 6 * time.Hour
|
|
case "12h":
|
|
points = sb.ring.Get24h()
|
|
cutoff = 12 * time.Hour
|
|
case "24h":
|
|
points = sb.ring.Get24h()
|
|
cutoff = 24 * time.Hour
|
|
default:
|
|
return nil, fmt.Errorf("invalid range: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", rangeTier)
|
|
}
|
|
|
|
// Filter points to the requested time window.
|
|
threshold := time.Now().Add(-cutoff)
|
|
filtered := points[:0:0]
|
|
for _, p := range points {
|
|
if !p.Timestamp.Before(threshold) {
|
|
filtered = append(filtered, p)
|
|
}
|
|
}
|
|
return filtered, nil
|
|
}
|
|
|
|
// FlushMetrics returns all three tier ring buffers, clears the ring, and
|
|
// stops the sampler goroutine. Called by the control plane before pause/destroy.
|
|
func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) {
|
|
m.mu.RLock()
|
|
sb, ok := m.boxes[sandboxID]
|
|
m.mu.RUnlock()
|
|
if !ok {
|
|
return nil, nil, nil, fmt.Errorf("%w: %s", ErrNotFound, sandboxID)
|
|
}
|
|
|
|
m.stopSampler(sb)
|
|
if sb.ring == nil {
|
|
return nil, nil, nil, nil
|
|
}
|
|
pts10m, pts2h, pts24h = sb.ring.Flush()
|
|
return pts10m, pts2h, pts24h, nil
|
|
}
|
|
|
|
// copyFile copies a regular file from src to dst using streaming I/O.
|
|
func copyFile(src, dst string) error {
|
|
sf, err := os.Open(src)
|
|
if err != nil {
|
|
return fmt.Errorf("open %s: %w", src, err)
|
|
}
|
|
defer sf.Close()
|
|
|
|
df, err := os.Create(dst)
|
|
if err != nil {
|
|
return fmt.Errorf("create %s: %w", dst, err)
|
|
}
|
|
defer df.Close()
|
|
|
|
if _, err := df.ReadFrom(sf); err != nil {
|
|
os.Remove(dst)
|
|
return fmt.Errorf("copy %s → %s: %w", src, dst, err)
|
|
}
|
|
return nil
|
|
}
|