Add host agent RPC server with sandbox lifecycle management

Implement the host agent as a Connect RPC server that orchestrates
sandbox creation, destruction, pause/resume, and command execution.
Includes sandbox manager with TTL-based reaper, network slot allocator,
rootfs cloning, hostagent proto definition with generated stubs, and
test/debug scripts. Fix Firecracker process lifetime bug where VM was
tied to HTTP request context instead of background context.
This commit is contained in:
2026-03-10 03:54:53 +06:00
parent c31ce90306
commit 6f0c365d44
24 changed files with 6236 additions and 121 deletions

View File

@ -2,26 +2,16 @@ package main
import (
"context"
"fmt"
"log/slog"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"syscall"
"time"
"git.omukk.dev/wrenn/sandbox/internal/envdclient"
"git.omukk.dev/wrenn/sandbox/internal/network"
"git.omukk.dev/wrenn/sandbox/internal/vm"
)
const (
kernelPath = "/var/lib/wrenn/kernels/vmlinux"
baseRootfs = "/var/lib/wrenn/sandboxes/rootfs.ext4"
sandboxesDir = "/var/lib/wrenn/sandboxes"
sandboxID = "sb-demo0001"
slotIndex = 1
"git.omukk.dev/wrenn/sandbox/internal/hostagent"
"git.omukk.dev/wrenn/sandbox/internal/sandbox"
"git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect"
)
func main() {
@ -39,112 +29,66 @@ func main() {
slog.Warn("failed to enable ip_forward", "error", err)
}
listenAddr := envOrDefault("AGENT_LISTEN_ADDR", ":50051")
kernelPath := envOrDefault("AGENT_KERNEL_PATH", "/var/lib/wrenn/kernels/vmlinux")
imagesPath := envOrDefault("AGENT_IMAGES_PATH", "/var/lib/wrenn/images")
sandboxesPath := envOrDefault("AGENT_SANDBOXES_PATH", "/var/lib/wrenn/sandboxes")
cfg := sandbox.Config{
KernelPath: kernelPath,
ImagesDir: imagesPath,
SandboxesDir: sandboxesPath,
}
mgr := sandbox.New(cfg)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Handle signals for clean shutdown.
mgr.StartTTLReaper(ctx)
srv := hostagent.NewServer(mgr)
path, handler := hostagentv1connect.NewHostAgentServiceHandler(srv)
mux := http.NewServeMux()
mux.Handle(path, handler)
httpServer := &http.Server{
Addr: listenAddr,
Handler: mux,
}
// Graceful shutdown on signal.
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
go func() {
sig := <-sigCh
slog.Info("received signal, shutting down", "signal", sig)
cancel()
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer shutdownCancel()
mgr.Shutdown(shutdownCtx)
if err := httpServer.Shutdown(shutdownCtx); err != nil {
slog.Error("http server shutdown error", "error", err)
}
}()
if err := run(ctx); err != nil {
slog.Error("fatal error", "error", err)
slog.Info("host agent starting", "addr", listenAddr)
if err := httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
slog.Error("http server error", "error", err)
os.Exit(1)
}
slog.Info("host agent stopped")
}
// run drives one sandbox, identified by the sandboxID and slotIndex constants,
// end to end: it clones the base rootfs, creates the network for the slot,
// boots the VM, waits for envd, executes "echo hello" in the guest and prints
// the result. Cleanup for each step is registered with defer once that step
// has succeeded. The first failing step is returned, wrapped with its name.
func run(ctx context.Context) error {
// Step 1: Clone rootfs for this sandbox.
sandboxRootfs := filepath.Join(sandboxesDir, fmt.Sprintf("%s-rootfs.ext4", sandboxID))
slog.Info("cloning rootfs", "src", baseRootfs, "dst", sandboxRootfs)
if err := cloneRootfs(baseRootfs, sandboxRootfs); err != nil {
return fmt.Errorf("clone rootfs: %w", err)
// envOrDefault returns the value of the environment variable key, or def when
// that variable is unset or set to the empty string.
func envOrDefault(key, def string) string {
if v := os.Getenv(key); v != "" {
return v
}
// Delete the per-sandbox rootfs copy when run returns.
defer os.Remove(sandboxRootfs)
// Step 2: Set up network.
slot := network.NewSlot(slotIndex)
slog.Info("setting up network", "slot", slotIndex)
if err := network.CreateNetwork(slot); err != nil {
return fmt.Errorf("create network: %w", err)
}
defer func() {
slog.Info("tearing down network")
network.RemoveNetwork(slot)
}()
// Step 3: Boot the VM.
mgr := vm.NewManager()
// The VM's network settings are all taken from the slot created in step 2.
cfg := vm.VMConfig{
SandboxID: sandboxID,
KernelPath: kernelPath,
RootfsPath: sandboxRootfs,
VCPUs: 1,
MemoryMB: 512,
NetworkNamespace: slot.NamespaceID,
TapDevice: slot.TapName,
TapMAC: slot.TapMAC,
GuestIP: slot.GuestIP,
GatewayIP: slot.TapIP,
NetMask: slot.GuestNetMask,
}
vmInstance, err := mgr.Create(ctx, cfg)
if err != nil {
return fmt.Errorf("create VM: %w", err)
}
// The returned instance is not used here; the remaining steps talk to the
// guest through the envd client instead.
_ = vmInstance
defer func() {
slog.Info("destroying VM")
// Destroy is given a fresh background context rather than ctx, presumably
// so teardown still runs after ctx has been cancelled — TODO confirm.
mgr.Destroy(context.Background(), sandboxID)
}()
// Step 4: Wait for envd to be ready.
client := envdclient.New(slot.HostIP.String())
// Bound the readiness wait to 30 seconds on top of ctx.
waitCtx, waitCancel := context.WithTimeout(ctx, 30*time.Second)
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
return fmt.Errorf("wait for envd: %w", err)
}
// Step 5: Run "echo hello" inside the sandbox.
slog.Info("executing command", "cmd", "echo hello")
result, err := client.Exec(ctx, "/bin/sh", "-c", "echo hello")
if err != nil {
return fmt.Errorf("exec: %w", err)
}
fmt.Printf("\n=== Command Output ===\n")
fmt.Printf("stdout: %s", string(result.Stdout))
// stderr is only printed when the command produced some.
if len(result.Stderr) > 0 {
fmt.Printf("stderr: %s", string(result.Stderr))
}
fmt.Printf("exit code: %d\n", result.ExitCode)
fmt.Printf("======================\n\n")
// Step 6: Clean shutdown.
slog.Info("demo complete, cleaning up")
return nil
return def
}
// cloneRootfs copies the base rootfs image at src to dst by running
// `cp --reflink=auto`. With that flag cp itself shares extents (copy-on-write)
// when the filesystem supports it and otherwise performs an ordinary copy.
// Any failure to start or complete cp is returned wrapped.
func cloneRootfs(src, dst string) error {
	// Delegate the reflink-or-copy decision entirely to cp.
	runErr := exec.Command("cp", "--reflink=auto", src, dst).Run()
	if runErr == nil {
		return nil
	}
	return fmt.Errorf("cp --reflink=auto: %w", runErr)
}