1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#50
This commit is contained in:
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions

View File

@ -24,6 +24,7 @@ import (
"git.omukk.dev/wrenn/wrenn/internal/layout"
"git.omukk.dev/wrenn/wrenn/internal/network"
"git.omukk.dev/wrenn/wrenn/internal/sandbox"
"git.omukk.dev/wrenn/wrenn/internal/vm"
"git.omukk.dev/wrenn/wrenn/pkg/auth"
"git.omukk.dev/wrenn/wrenn/pkg/logging"
"git.omukk.dev/wrenn/wrenn/proto/hostagent/gen/hostagentv1connect"
@ -63,8 +64,12 @@ func main() {
os.Exit(1)
}
// Clean up stale resources from a previous crash.
// Clean up stale resources from a previous crash. Order matters:
// kill stale CH processes first — they hold dm-snapshot devices open and
// would otherwise cause "Device or resource busy" on dmsetup remove.
vm.CleanupStaleProcesses()
devicemapper.CleanupStaleDevices()
devicemapper.LogLoopState()
network.CleanupStaleNamespaces()
listenAddr := envOrDefault("WRENN_HOST_LISTEN_ADDR", ":50051")
@ -126,27 +131,45 @@ func main() {
}
slog.Info("resolved kernel", "version", kernelVersion, "path", kernelPath)
// Detect firecracker version.
fcBin := envOrDefault("WRENN_FIRECRACKER_BIN", "/usr/local/bin/firecracker")
fcVersion, err := sandbox.DetectFirecrackerVersion(fcBin)
// Detect cloud-hypervisor version.
chBin := envOrDefault("WRENN_CH_BIN", "/usr/local/bin/cloud-hypervisor")
chVersion, err := sandbox.DetectCHVersion(chBin)
if err != nil {
slog.Error("failed to detect firecracker version", "error", err)
slog.Error("failed to detect cloud-hypervisor version", "error", err)
os.Exit(1)
}
slog.Info("resolved firecracker", "version", fcVersion, "path", fcBin)
slog.Info("resolved cloud-hypervisor", "version", chVersion, "path", chBin)
cfg := sandbox.Config{
WrennDir: rootDir,
DefaultRootfsSizeMB: defaultRootfsSizeMB,
KernelPath: kernelPath,
KernelVersion: kernelVersion,
FirecrackerBin: fcBin,
FirecrackerVersion: fcVersion,
VMMBin: chBin,
VMMVersion: chVersion,
AgentVersion: version,
}
// Remove any *.staging-* / *.trash-* directories left behind by a
// previous Pause that crashed before completing the atomic swap. Must
// run before any Resume so we don't race with a sandbox restoration.
sandbox.CleanupOrphanPauseDirs(rootDir)
mgr := sandbox.New(cfg)
// Set up lifecycle event callback sender so autonomous events
// (auto-pause, auto-destroy) are pushed to the CP proactively.
cb := hostagent.NewCallbackSender(cpURL, credsFile, creds.HostID)
mgr.SetEventSender(hostagent.NewEventSender(cb))
// Restore paused sandboxes from disk so ListSandboxes reports them as
// 'paused' immediately. Without this, the CP's HostMonitor would mark
// every paused-on-disk sandbox 'stopped' via the missing→stopped
// reconcile path on the first ListSandboxes after agent restart.
// Must run before HTTP server starts serving (an early Create would
// race the slot reservation).
mgr.RestorePausedSandboxes()
mgr.StartTTLReaper(ctx)
// httpServer is declared here so the shutdown func can reference it.
@ -190,10 +213,22 @@ func main() {
shutdownOnce.Do(func() {
slog.Info("shutting down", "reason", reason)
cancel()
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second)
// Shutdown pauses every running sandbox in parallel (PauseAll uses
// a worker pool). Per-sandbox Pause can take 10-30s (memory loader
// wait + ch.snapshot of guest RAM). 5 minutes is enough headroom for
// a busy host while still bounded so a wedged sandbox can't keep the
// process alive indefinitely — a second signal force-exits anyway.
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer shutdownCancel()
// Order matters: mgr.Shutdown FIRST so it runs to completion
// before httpServer.Shutdown unblocks main's Serve and lets the
// process exit. mgr.Shutdown internally flips a draining flag
// that rejects new Create/Resume RPCs with Unavailable so any
// in-flight HTTP handlers can't add sandboxes after PauseAll
// snapshotted state. User-initiated Pauses already running are
// awaited by PauseAll/Destroy's lifecycleMu serialization.
mgr.Shutdown(shutdownCtx)
sandbox.ShrinkMinimalImage(rootDir)
sandbox.ShrinkSystemImages(rootDir)
if err := httpServer.Shutdown(shutdownCtx); err != nil {
slog.Error("http server shutdown error", "error", err)
}
@ -226,8 +261,9 @@ func main() {
func() {
doShutdown("host deleted from CP")
},
// onCredsRefreshed: hot-swap the TLS certificate after a JWT refresh.
// onCredsRefreshed: hot-swap the TLS certificate and update callback JWT.
func(tf *hostagent.TokenFile) {
cb.UpdateJWT(tf.JWT)
if tf.CertPEM == "" || tf.KeyPEM == "" {
return
}
@ -239,12 +275,16 @@ func main() {
},
)
// Graceful shutdown on SIGINT/SIGTERM.
// Graceful shutdown on SIGINT/SIGTERM. A second signal force-exits
// so the operator can always kill the process if shutdown hangs.
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
go func() {
sig := <-sigCh
doShutdown("signal: " + sig.String())
go doShutdown("signal: " + sig.String())
sig = <-sigCh
slog.Error("received second signal, force exiting", "signal", sig.String())
os.Exit(1)
}()
slog.Info("host agent starting", "addr", listenAddr, "host_id", creds.HostID, "version", version, "commit", commit)
@ -286,7 +326,7 @@ func checkPrivileges() error {
name string
}{
{1, "CAP_DAC_OVERRIDE"}, // /dev/loop*, /dev/mapper/*, /dev/net/tun
{5, "CAP_KILL"}, // SIGTERM/SIGKILL to Firecracker processes
{5, "CAP_KILL"}, // SIGTERM/SIGKILL to cloud-hypervisor processes
{12, "CAP_NET_ADMIN"}, // netlink, iptables, routing, TAP/veth
{13, "CAP_NET_RAW"}, // raw sockets (iptables)
{19, "CAP_SYS_PTRACE"}, // reading /proc/self/ns/net (netns.Get)