Implement the host agent as a Connect RPC server that orchestrates sandbox creation, destruction, pause/resume, and command execution. Includes a sandbox manager with a TTL-based reaper, a network slot allocator, rootfs cloning, the hostagent proto definition with generated stubs, and test/debug scripts. Fix a Firecracker process-lifetime bug where the VM was tied to the HTTP request context instead of a background context, so the VM died as soon as the creating request returned.
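The gist of that lifetime fix, as a minimal sketch (the function names and the exec-based spawning below are illustrative assumptions, not the real vm package API): a process spawned with exec.CommandContext and the request's ctx is killed the moment the RPC handler returns, so the VM must be started against a context that outlives the request.

// Hypothetical sketch of the bug and the fix; startVMBuggy/startVMFixed
// are illustrative names, not the actual vm package API.
package vm

import (
	"context"
	"os/exec"
)

// Buggy: the request ctx is canceled when the Connect handler returns,
// and exec.CommandContext then kills the Firecracker process with it.
func startVMBuggy(reqCtx context.Context, firecrackerBin string, args ...string) (*exec.Cmd, error) {
	cmd := exec.CommandContext(reqCtx, firecrackerBin, args...)
	return cmd, cmd.Start()
}

// Fixed: the process lifetime is detached from the request; the request
// ctx should only bound setup steps (e.g., waiting for the VM to boot).
func startVMFixed(firecrackerBin string, args ...string) (*exec.Cmd, error) {
	cmd := exec.CommandContext(context.Background(), firecrackerBin, args...)
	return cmd, cmd.Start()
}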
357 lines · 8.6 KiB · Go
package sandbox

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"sync"
	"time"

	"git.omukk.dev/wrenn/sandbox/internal/envdclient"
	"git.omukk.dev/wrenn/sandbox/internal/filesystem"
	"git.omukk.dev/wrenn/sandbox/internal/id"
	"git.omukk.dev/wrenn/sandbox/internal/models"
	"git.omukk.dev/wrenn/sandbox/internal/network"
	"git.omukk.dev/wrenn/sandbox/internal/vm"
)

// Config holds the paths and defaults for the sandbox manager.
type Config struct {
	KernelPath   string
	ImagesDir    string // directory containing base rootfs images (e.g., /var/lib/wrenn/images/minimal.ext4)
	SandboxesDir string // directory for per-sandbox rootfs clones (e.g., /var/lib/wrenn/sandboxes)
	EnvdTimeout  time.Duration
}

// Manager orchestrates sandbox lifecycle: VM, network, filesystem, envd.
type Manager struct {
	cfg   Config
	vm    *vm.Manager
	slots *network.SlotAllocator

	mu     sync.RWMutex
	boxes  map[string]*sandboxState
	stopCh chan struct{}
}

// sandboxState holds the runtime state for a single sandbox.
type sandboxState struct {
	models.Sandbox
	slot   *network.Slot
	client *envdclient.Client
}

// New creates a new sandbox manager.
func New(cfg Config) *Manager {
	if cfg.EnvdTimeout == 0 {
		cfg.EnvdTimeout = 30 * time.Second
	}
	return &Manager{
		cfg:    cfg,
		vm:     vm.NewManager(),
		slots:  network.NewSlotAllocator(),
		boxes:  make(map[string]*sandboxState),
		stopCh: make(chan struct{}),
	}
}

// Create boots a new sandbox: clone rootfs, set up network, start VM, wait for envd.
func (m *Manager) Create(ctx context.Context, template string, vcpus, memoryMB, timeoutSec int) (*models.Sandbox, error) {
	sandboxID := id.NewSandboxID()

	if vcpus <= 0 {
		vcpus = 1
	}
	if memoryMB <= 0 {
		memoryMB = 512
	}
	if template == "" {
		template = "minimal"
	}

	// Resolve base rootfs image: /var/lib/wrenn/images/{template}.ext4
	baseRootfs := filepath.Join(m.cfg.ImagesDir, template+".ext4")
	if _, err := os.Stat(baseRootfs); err != nil {
		return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
	}

	// Clone rootfs.
	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, template))
	if err := filesystem.CloneRootfs(baseRootfs, rootfsPath); err != nil {
		return nil, fmt.Errorf("clone rootfs: %w", err)
	}

	// Allocate network slot.
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)

	// Set up network.
	if err := network.CreateNetwork(slot); err != nil {
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("create network: %w", err)
	}

	// Boot VM.
	vmCfg := vm.VMConfig{
		SandboxID:        sandboxID,
		KernelPath:       m.cfg.KernelPath,
		RootfsPath:       rootfsPath,
		VCPUs:            vcpus,
		MemoryMB:         memoryMB,
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
	}
	if _, err := m.vm.Create(ctx, vmCfg); err != nil {
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("create VM: %w", err)
	}

	// Wait for envd to be ready.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer waitCancel()

	if err := client.WaitUntilReady(waitCtx); err != nil {
		m.vm.Destroy(context.Background(), sandboxID)
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}

	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:           sandboxID,
			Status:       models.StatusRunning,
			Template:     template,
			VCPUs:        vcpus,
			MemoryMB:     memoryMB,
			TimeoutSec:   timeoutSec,
			SlotIndex:    slotIdx,
			HostIP:       slot.HostIP,
			RootfsPath:   rootfsPath,
			CreatedAt:    now,
			LastActiveAt: now,
		},
		slot:   slot,
		client: client,
	}

	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()

	slog.Info("sandbox created",
		"id", sandboxID,
		"template", template,
		"host_ip", slot.HostIP.String(),
	)
	return &sb.Sandbox, nil
}

// Destroy stops and cleans up a sandbox.
func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
	m.mu.Lock()
	sb, ok := m.boxes[sandboxID]
	if !ok {
		m.mu.Unlock()
		return fmt.Errorf("sandbox not found: %s", sandboxID)
	}
	delete(m.boxes, sandboxID)
	m.mu.Unlock()

	m.cleanup(ctx, sb)

	slog.Info("sandbox destroyed", "id", sandboxID)
	return nil
}

// cleanup tears down all resources for a sandbox.
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
	if err := m.vm.Destroy(ctx, sb.ID); err != nil {
		slog.Warn("vm destroy error", "id", sb.ID, "error", err)
	}
	if err := network.RemoveNetwork(sb.slot); err != nil {
		slog.Warn("network cleanup error", "id", sb.ID, "error", err)
	}
	m.slots.Release(sb.SlotIndex)
	os.Remove(sb.RootfsPath)
}

// Pause pauses a running sandbox.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}

	if sb.Status != models.StatusRunning {
		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	if err := m.vm.Pause(ctx, sandboxID); err != nil {
		return fmt.Errorf("pause VM: %w", err)
	}

	m.mu.Lock()
	sb.Status = models.StatusPaused
	m.mu.Unlock()

	slog.Info("sandbox paused", "id", sandboxID)
	return nil
}

// Resume resumes a paused sandbox.
func (m *Manager) Resume(ctx context.Context, sandboxID string) error {
	sb, err := m.get(sandboxID)
	if err != nil {
		return err
	}

	if sb.Status != models.StatusPaused {
		return fmt.Errorf("sandbox %s is not paused (status: %s)", sandboxID, sb.Status)
	}

	if err := m.vm.Resume(ctx, sandboxID); err != nil {
		return fmt.Errorf("resume VM: %w", err)
	}

	m.mu.Lock()
	sb.Status = models.StatusRunning
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	slog.Info("sandbox resumed", "id", sandboxID)
	return nil
}

// Exec runs a command inside a sandbox.
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args ...string) (*envdclient.ExecResult, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}

	if sb.Status != models.StatusRunning {
		return nil, fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
	}

	m.mu.Lock()
	sb.LastActiveAt = time.Now()
	m.mu.Unlock()

	return sb.client.Exec(ctx, cmd, args...)
}

// List returns all sandboxes.
func (m *Manager) List() []models.Sandbox {
	m.mu.RLock()
	defer m.mu.RUnlock()

	result := make([]models.Sandbox, 0, len(m.boxes))
	for _, sb := range m.boxes {
		result = append(result, sb.Sandbox)
	}
	return result
}

// Get returns a sandbox by ID.
func (m *Manager) Get(sandboxID string) (*models.Sandbox, error) {
	sb, err := m.get(sandboxID)
	if err != nil {
		return nil, err
	}
	return &sb.Sandbox, nil
}

func (m *Manager) get(sandboxID string) (*sandboxState, error) {
	m.mu.RLock()
	defer m.mu.RUnlock()

	sb, ok := m.boxes[sandboxID]
	if !ok {
		return nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}
	return sb, nil
}

// StartTTLReaper starts a background goroutine that destroys sandboxes
// that have exceeded their TTL (timeout_sec of inactivity).
func (m *Manager) StartTTLReaper(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(10 * time.Second)
		defer ticker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-m.stopCh:
				return
			case <-ticker.C:
				m.reapExpired(ctx)
			}
		}
	}()
}

func (m *Manager) reapExpired(ctx context.Context) {
	m.mu.RLock()
	var expired []string
	now := time.Now()
	for id, sb := range m.boxes {
		if sb.TimeoutSec <= 0 {
			continue
		}
		if sb.Status != models.StatusRunning && sb.Status != models.StatusPaused {
			continue
		}
		if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
			expired = append(expired, id)
		}
	}
	m.mu.RUnlock()

	for _, id := range expired {
		slog.Info("TTL expired, destroying sandbox", "id", id)
		if err := m.Destroy(ctx, id); err != nil {
			slog.Warn("TTL reap failed", "id", id, "error", err)
		}
	}
}

// Shutdown destroys all sandboxes and stops the TTL reaper.
func (m *Manager) Shutdown(ctx context.Context) {
	close(m.stopCh)

	m.mu.Lock()
	ids := make([]string, 0, len(m.boxes))
	for id := range m.boxes {
		ids = append(ids, id)
	}
	m.mu.Unlock()

	for _, sbID := range ids {
		slog.Info("shutdown: destroying sandbox", "id", sbID)
		if err := m.Destroy(ctx, sbID); err != nil {
			slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
		}
	}
}
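For orientation, here is a minimal sketch of how a caller might drive this manager end to end. The import path, kernel and image paths, and the TTL value are assumptions for illustration; only the Manager methods shown above are taken from the file.

// Hypothetical driver program; paths and the import location are assumed.
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"git.omukk.dev/wrenn/sandbox/internal/sandbox" // assumed import path for this package
)

func main() {
	mgr := sandbox.New(sandbox.Config{
		KernelPath:   "/var/lib/wrenn/vmlinux", // assumed kernel location
		ImagesDir:    "/var/lib/wrenn/images",
		SandboxesDir: "/var/lib/wrenn/sandboxes",
		EnvdTimeout:  30 * time.Second,
	})

	ctx := context.Background()
	mgr.StartTTLReaper(ctx)
	defer mgr.Shutdown(ctx)

	// Boot a sandbox from the "minimal" template: 1 vCPU, 512 MiB,
	// reaped after 300 seconds of inactivity.
	sb, err := mgr.Create(ctx, "minimal", 1, 512, 300)
	if err != nil {
		log.Fatal(err)
	}

	// Run a command inside the guest via envd; ExecResult's fields
	// depend on the envdclient package.
	res, err := mgr.Exec(ctx, sb.ID, "uname", "-a")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("exec result: %+v\n", res)

	if err := mgr.Destroy(ctx, sb.ID); err != nil {
		log.Fatal(err)
	}
}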