forked from wrenn/wrenn
Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required. Key changes: - Remove fc.go, jailer.go (FC process management) - Remove internal/uffd/ package (userfaultfd lazy page loading) - Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format) - Add ch.go (CH HTTP API client over Unix socket) - Add process.go (CH process lifecycle with unshare+netns) - Add chversion.go (CH version detection) - Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher - Simplify snapshot/local.go to CH's native snapshot format - Update VM config: FirecrackerBin → VMMBin, new CH-specific fields - Update envdclient, devicemapper, network for CH compatibility
292 lines
9.9 KiB
Go
292 lines
9.9 KiB
Go
package api
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"connectrpc.com/connect"
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
|
|
"git.omukk.dev/wrenn/wrenn/pkg/audit"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/db"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/id"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/lifecycle"
|
|
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
|
|
)
|
|
|
|
// unreachableThreshold is how long a host can go without a heartbeat before
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second

// transientGracePeriod is how long a sandbox is allowed to stay in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state. This prevents the monitor from racing against in-flight RPCs
// that may not have registered the sandbox on the host agent yet.
// NOTE(review): 2 minutes presumably exceeds the slowest start/resume path —
// confirm against the host agent's operation timeouts.
const transientGracePeriod = 2 * time.Minute
|
|
|
|
// HostMonitor runs on a fixed interval and performs two duties:
//
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
// "unreachable" and marks their active sandboxes as "missing".
//
// 2. Active reconciliation: for each online host, calls ListSandboxes and
// reconciles DB state against live host state — restoring "missing"
// sandboxes that are actually alive, and stopping orphaned ones.
type HostMonitor struct {
	db       *db.Queries              // control-plane database queries
	pool     *lifecycle.HostClientPool // per-host agent RPC clients
	audit    *audit.AuditLogger        // records host-down / auto-pause events
	interval time.Duration             // how often the monitor loop runs
}
|
|
|
|
// NewHostMonitor creates a HostMonitor.
|
|
func NewHostMonitor(queries *db.Queries, pool *lifecycle.HostClientPool, al *audit.AuditLogger, interval time.Duration) *HostMonitor {
|
|
return &HostMonitor{
|
|
db: queries,
|
|
pool: pool,
|
|
audit: al,
|
|
interval: interval,
|
|
}
|
|
}
|
|
|
|
// Start runs the monitor loop until the context is cancelled.
|
|
func (m *HostMonitor) Start(ctx context.Context) {
|
|
go func() {
|
|
ticker := time.NewTicker(m.interval)
|
|
defer ticker.Stop()
|
|
|
|
// Run immediately on startup so the CP doesn't wait one full interval
|
|
// before reconciling host and sandbox state.
|
|
m.run(ctx)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
m.run(ctx)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (m *HostMonitor) run(ctx context.Context) {
|
|
hosts, err := m.db.ListActiveHosts(ctx)
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list hosts", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, host := range hosts {
|
|
m.checkHost(ctx, host)
|
|
}
|
|
}
|
|
|
|
// ReconcileHost triggers immediate active reconciliation for a single host.
|
|
// Called when a host transitions from unreachable → online so sandboxes marked
|
|
// "missing" are resolved without waiting for the next monitor tick.
|
|
func (m *HostMonitor) ReconcileHost(ctx context.Context, hostID pgtype.UUID) {
|
|
host, err := m.db.GetHost(ctx, hostID)
|
|
if err != nil {
|
|
slog.Warn("host monitor: reconcile-on-connect: failed to get host", "error", err)
|
|
return
|
|
}
|
|
if host.Status != "online" {
|
|
return
|
|
}
|
|
m.checkHost(ctx, host)
|
|
}
|
|
|
|
func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
|
// --- Passive phase: check heartbeat staleness ---
|
|
|
|
stale := !host.LastHeartbeatAt.Valid ||
|
|
time.Since(host.LastHeartbeatAt.Time) > unreachableThreshold
|
|
|
|
if stale && host.Status != "unreachable" {
|
|
slog.Info("host monitor: marking host unreachable", "host_id", id.FormatHostID(host.ID),
|
|
"last_heartbeat", host.LastHeartbeatAt.Time)
|
|
if err := m.db.MarkHostUnreachable(ctx, host.ID); err != nil {
|
|
slog.Warn("host monitor: failed to mark host unreachable", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
if err := m.db.MarkSandboxesMissingByHost(ctx, host.ID); err != nil {
|
|
slog.Warn("host monitor: failed to mark sandboxes missing", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
m.audit.LogHostMarkedDown(ctx, host.TeamID, host.ID)
|
|
return
|
|
}
|
|
|
|
// --- Active reconciliation: only for online hosts ---
|
|
|
|
if host.Status != "online" {
|
|
return
|
|
}
|
|
|
|
agent, err := m.pool.GetForHost(host)
|
|
if err != nil {
|
|
// Host has no address yet (e.g., just registered) — skip.
|
|
return
|
|
}
|
|
|
|
resp, err := agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
|
|
if err != nil {
|
|
// RPC failure is a transient condition; the passive phase will catch it
|
|
// if heartbeats stop arriving.
|
|
slog.Debug("host monitor: ListSandboxes failed (transient)", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
// Build set of sandbox IDs alive on the host.
|
|
// The host agent returns sandbox IDs as strings (formatted with prefix).
|
|
alive := make(map[string]struct{}, len(resp.Msg.Sandboxes))
|
|
for _, sb := range resp.Msg.Sandboxes {
|
|
alive[sb.SandboxId] = struct{}{}
|
|
}
|
|
|
|
autoPaused := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
|
|
for _, apID := range resp.Msg.AutoPausedSandboxIds {
|
|
autoPaused[apID] = struct{}{}
|
|
}
|
|
|
|
// --- Restore sandboxes that are "missing" in DB but alive on host ---
|
|
// This handles the case where CP marked them missing due to a transient
|
|
// heartbeat gap, but the host was actually fine.
|
|
|
|
missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"missing"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
} else {
|
|
var toRestore []pgtype.UUID
|
|
var toStop []pgtype.UUID
|
|
for _, sb := range missingSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
toRestore = append(toRestore, sb.ID)
|
|
} else {
|
|
toStop = append(toStop, sb.ID)
|
|
}
|
|
}
|
|
if len(toRestore) > 0 {
|
|
slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toRestore))
|
|
if err := m.db.BulkRestoreRunning(ctx, toRestore); err != nil {
|
|
slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
if len(toStop) > 0 {
|
|
slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toStop,
|
|
Status: "stopped",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// --- Find running sandboxes in DB that are no longer alive on the host ---
|
|
|
|
runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"running"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list running sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
var toPause, toStop []pgtype.UUID
|
|
sbTeamID := make(map[pgtype.UUID]pgtype.UUID, len(runningSandboxes))
|
|
for _, sb := range runningSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
sbTeamID[sb.ID] = sb.TeamID
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
continue
|
|
}
|
|
if _, ok := autoPaused[sbIDStr]; ok {
|
|
toPause = append(toPause, sb.ID)
|
|
} else {
|
|
toStop = append(toStop, sb.ID)
|
|
}
|
|
}
|
|
|
|
if len(toPause) > 0 {
|
|
slog.Info("host monitor: marking auto-paused sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toPause))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toPause,
|
|
Status: "paused",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to mark paused", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
for _, sbID := range toPause {
|
|
m.audit.LogSandboxAutoPause(ctx, sbTeamID[sbID], sbID)
|
|
}
|
|
}
|
|
if len(toStop) > 0 {
|
|
slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toStop,
|
|
Status: "stopped",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
|
|
// --- Reconcile transient statuses (starting, resuming, pausing, stopping) ---
|
|
// These represent in-flight operations. If the sandbox is no longer alive on
|
|
// the host, infer the final state based on the transient status.
|
|
|
|
transientSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"starting", "resuming", "pausing", "stopping"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list transient sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
for _, sb := range transientSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
// Sandbox is alive on host — the background goroutine should
|
|
// finalize the transition. For starting/resuming, if the sandbox
|
|
// is alive it means creation/resume succeeded.
|
|
if sb.Status == "starting" || sb.Status == "resuming" {
|
|
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sb.ID, Status: sb.Status, Status_2: "running",
|
|
}); err == nil {
|
|
slog.Info("host monitor: promoted transient sandbox to running", "sandbox_id", sbIDStr, "from", sb.Status)
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
// Sandbox is not alive on host. If the transition is recent, give the
|
|
// in-flight RPC time to finish before declaring a final state.
|
|
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
|
|
slog.Debug("host monitor: transient sandbox still within grace period",
|
|
"sandbox_id", sbIDStr, "status", sb.Status,
|
|
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
|
|
continue
|
|
}
|
|
|
|
// Grace period expired — infer final state.
|
|
var finalStatus string
|
|
switch sb.Status {
|
|
case "starting", "resuming":
|
|
finalStatus = "error"
|
|
case "pausing":
|
|
finalStatus = "paused"
|
|
case "stopping":
|
|
finalStatus = "stopped"
|
|
}
|
|
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sb.ID, Status: sb.Status, Status_2: finalStatus,
|
|
}); err == nil {
|
|
slog.Info("host monitor: resolved transient sandbox", "sandbox_id", sbIDStr, "from", sb.Status, "to", finalStatus)
|
|
}
|
|
}
|
|
}
|