v0.2.0 (#50)

Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions
--- a/internal/api/host_monitor.go
+++ b/internal/api/host_monitor.go
@@ -2,6 +2,7 @@ package api

 import (
 	"context"
+	"errors"
 	"log/slog"
 	"time"

@@ -15,10 +16,30 @@ import (
 	pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
 )

+// errInferredTransientTimeout marks a state change that the reconciler
+// inferred after a transient (starting/resuming/snapshotting) sandbox failed
+// to settle within the grace period. Used as the err value on system audit
+// calls so the published event carries Outcome=error with a readable message.
+var errInferredTransientTimeout = errors.New("transient state did not settle within grace period")
+
 // unreachableThreshold is how long a host can go without a heartbeat before
 // it is considered unreachable (3 missed 30-second heartbeats).
 const unreachableThreshold = 90 * time.Second

+// transientGracePeriod is how long a sandbox absent from the host may stay in a
+// transient status (starting, resuming, pausing, stopping, snapshotting) before
+// the monitor infers a final state. This prevents the monitor from racing
+// in-flight RPCs that may not have registered the sandbox on the host agent yet.
+const transientGracePeriod = 2 * time.Minute
+
+// snapshotGracePeriod is the grace for a sandbox stuck in "snapshotting" while
+// the VM is still alive on the host. Snapshots dump guest RAM and flatten the
+// rootfs, which can run for minutes on large sandboxes, and the agent reports
+// the VM as alive throughout — so we must not race the in-flight operation.
+// It exceeds the background goroutine's 10-minute deadline, so reaching it
+// means the control plane crashed mid-snapshot and the sandbox needs recovery.
+const snapshotGracePeriod = 15 * time.Minute
+
 // HostMonitor runs on a fixed interval and performs two duties:
 //
 //  1. Passive check: marks hosts whose last_heartbeat_at is stale as
@@ -77,6 +98,21 @@ func (m *HostMonitor) run(ctx context.Context) {
 	}
 }

+// ReconcileHost runs checkHost for one host right away instead of waiting for the
+// next monitor tick. Called on the unreachable → online transition so sandboxes
+// marked "missing" resolve promptly; returns early on lookup failure or non-online status.
+func (m *HostMonitor) ReconcileHost(ctx context.Context, hostID pgtype.UUID) {
+	host, err := m.db.GetHost(ctx, hostID)
+	if err != nil {
+		slog.Warn("host monitor: reconcile-on-connect: failed to get host", "error", err)
+		return
+	}
+	if host.Status != "online" {
+		return // only hosts whose DB status is "online" are reconciled here
+	}
+	m.checkHost(ctx, host)
+}
+
 func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
 	// --- Passive phase: check heartbeat staleness ---

@@ -116,21 +152,29 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
 		return
 	}

-	// Build set of sandbox IDs alive on the host.
-	// The host agent returns sandbox IDs as strings (formatted with prefix).
-	alive := make(map[string]struct{}, len(resp.Msg.Sandboxes))
+	// Build map of sandbox ID -> reported status. Transient statuses
+	// (pausing/resuming/starting/stopping) are coerced to a presence-only
+	// entry: ListSandboxes can observe the in-memory status mid-transition
+	// (Pause flips the status under m.mu while List holds m.mu.RLock), and
+	// writing those transient labels into the DB would force the transient
+	// reconciliation phase to wait the full grace period before resolving.
+	// Recording the presence keeps "missing → restore" and "running →
+	// orphan-stop" logic correct without overwriting with stale labels;
+	// the next monitor tick reads the settled status.
+	aliveStatus := make(map[string]string, len(resp.Msg.Sandboxes))
 	for _, sb := range resp.Msg.Sandboxes {
-		alive[sb.SandboxId] = struct{}{}
-	}
-
-	autoPaused := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
-	for _, apID := range resp.Msg.AutoPausedSandboxIds {
-		autoPaused[apID] = struct{}{}
+		status := sb.Status
+		switch status {
+		case "pausing", "resuming", "starting", "stopping":
+			status = ""
+		}
+		aliveStatus[sb.SandboxId] = status
 	}

 	// --- Restore sandboxes that are "missing" in DB but alive on host ---
-	// This handles the case where CP marked them missing due to a transient
-	// heartbeat gap, but the host was actually fine.
+	// Handles transient heartbeat gaps where the host was actually fine. The
+	// reported status must be honored: a sandbox the agent paused while CP
+	// was disconnected must not be silently promoted back to running.

 	missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
 		HostID:  host.ID,
@@ -139,34 +183,65 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
 	if err != nil {
 		slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
 	} else {
-		var toRestore []pgtype.UUID
-		var toStop []pgtype.UUID
+		restoreByStatus := make(map[string][]db.Sandbox)
+		var toStop []db.Sandbox
 		for _, sb := range missingSandboxes {
 			sbIDStr := id.FormatSandboxID(sb.ID)
-			if _, ok := alive[sbIDStr]; ok {
-				toRestore = append(toRestore, sb.ID)
-			} else {
-				toStop = append(toStop, sb.ID)
+			status, ok := aliveStatus[sbIDStr]
+			if !ok {
+				toStop = append(toStop, sb)
+				continue
 			}
+			if status == "" {
+				continue
+			}
+			restoreByStatus[status] = append(restoreByStatus[status], sb)
 		}
-		if len(toRestore) > 0 {
-			slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toRestore))
-			if err := m.db.BulkRestoreRunning(ctx, toRestore); err != nil {
-				slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
+		for status, sbs := range restoreByStatus {
+			ids := make([]pgtype.UUID, len(sbs))
+			for i, sb := range sbs {
+				ids[i] = sb.ID
+			}
+			slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
+			if err := m.db.BulkRestoreMissingToStatus(ctx, db.BulkRestoreMissingToStatusParams{
+				Column1: ids,
+				Status:  status,
+			}); err != nil {
+				slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
+				continue
+			}
+			// Only restore→paused emits a notification (per design: running restore is silent).
+			if status == "paused" {
+				for _, sb := range sbs {
+					m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "restored_after_host_recovery", nil)
+				}
 			}
 		}
 		if len(toStop) > 0 {
-			slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
+			ids := make([]pgtype.UUID, len(toStop))
+			for i, sb := range toStop {
+				ids[i] = sb.ID
+			}
+			slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(ids))
 			if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
-				Column1: toStop,
+				Column1: ids,
 				Status:  "stopped",
 			}); err != nil {
 				slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
+			} else {
+				for _, sb := range toStop {
+					m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
+				}
 			}
 		}
 	}

-	// --- Find running sandboxes in DB that are no longer alive on the host ---
+	// --- Reconcile running sandboxes in DB against live host state ---
+	// Three cases per DB-running row:
+	//   absent on host             -> stopped
+	//   present, running/in-flight -> no change (in-flight labels are coerced to "")
+	//   present but paused/etc.    -> sync DB to reported status (catches the
+	//                                 shutdown-pause notify failure case)

 	runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
 		HostID:  host.ID,
@@ -177,40 +252,196 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
 		return
 	}

-	var toPause, toStop []pgtype.UUID
-	sbTeamID := make(map[pgtype.UUID]pgtype.UUID, len(runningSandboxes))
+	var toStop []db.Sandbox
+	syncByStatus := make(map[string][]db.Sandbox)
 	for _, sb := range runningSandboxes {
 		sbIDStr := id.FormatSandboxID(sb.ID)
-		sbTeamID[sb.ID] = sb.TeamID
-		if _, ok := alive[sbIDStr]; ok {
+		status, ok := aliveStatus[sbIDStr]
+		if !ok {
+			toStop = append(toStop, sb)
 			continue
 		}
-		if _, ok := autoPaused[sbIDStr]; ok {
-			toPause = append(toPause, sb.ID)
-		} else {
-			toStop = append(toStop, sb.ID)
+		if status == "running" || status == "" {
+			continue
 		}
+		syncByStatus[status] = append(syncByStatus[status], sb)
 	}

-	if len(toPause) > 0 {
-		slog.Info("host monitor: marking auto-paused sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toPause))
-		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
-			Column1: toPause,
-			Status:  "paused",
-		}); err != nil {
-			slog.Warn("host monitor: failed to mark paused", "host_id", id.FormatHostID(host.ID), "error", err)
-		}
-		for _, sbID := range toPause {
-			m.audit.LogSandboxAutoPause(ctx, sbTeamID[sbID], sbID)
-		}
-	}
 	if len(toStop) > 0 {
-		slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
+		ids := make([]pgtype.UUID, len(toStop))
+		for i, sb := range toStop {
+			ids[i] = sb.ID
+		}
+		slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(ids))
 		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
-			Column1: toStop,
+			Column1: ids,
 			Status:  "stopped",
 		}); err != nil {
 			slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
+		} else {
+			for _, sb := range toStop {
+				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
+			}
+		}
+	}
+	for status, sbs := range syncByStatus {
+		ids := make([]pgtype.UUID, len(sbs))
+		for i, sb := range sbs {
+			ids[i] = sb.ID
+		}
+		slog.Info("host monitor: syncing running→reported status", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
+		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
+			Column1: ids,
+			Status:  status,
+		}); err != nil {
+			slog.Warn("host monitor: failed to sync running sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
+			continue
+		}
+		if status == "paused" {
+			for _, sb := range sbs {
+				m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "host_state_sync", nil)
+			}
+		}
+	}
+
+	// --- Reconcile DB-stopped + agent-paused zombies ---
+	// A sandbox the agent reports as 'paused' but DB has as 'stopped' is an
+	// orphan from a previous bug where a successful pause's auto_paused
+	// callback was lost (e.g. CP unreachable during agent shutdown). With the
+	// agent-side fix (RestorePausedSandboxes), the snapshot survives across
+	// agent restarts and surfaces here. Authoritative direction: DB wins
+	// (user already saw 'stopped' and may have stopped tracking it).
+	// Issue Destroy so the on-disk snapshot dir is removed and the agent's
+	// slot reservation released.
+	//
+	// Gate: only run the DB query if the agent reports at least one paused
+	// sandbox. Otherwise we'd fetch every historically-stopped sandbox on
+	// this host every monitor tick — unbounded growth over a host's lifetime.
+	hasPaused := false
+	for _, status := range aliveStatus {
+		if status == "paused" {
+			hasPaused = true
+			break
+		}
+	}
+	if hasPaused {
+		stoppedSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
+			HostID:  host.ID,
+			Column2: []string{"stopped"},
+		})
+		if err != nil {
+			slog.Warn("host monitor: failed to list stopped sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
+		} else {
+			for _, sb := range stoppedSandboxes {
+				sbIDStr := id.FormatSandboxID(sb.ID)
+				status, ok := aliveStatus[sbIDStr]
+				if !ok || status != "paused" {
+					continue
+				}
+				slog.Info("host monitor: destroying DB-stopped agent-paused zombie",
+					"host_id", id.FormatHostID(host.ID), "sandbox_id", sbIDStr)
+				if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
+					SandboxId: sbIDStr,
+				})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
+					slog.Warn("host monitor: zombie destroy failed",
+						"sandbox_id", sbIDStr, "error", err)
+					continue
+				}
+				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "paused_zombie_cleanup", nil)
+			}
+		}
+	}
+
+	// --- Reconcile transient statuses (starting, resuming, pausing, stopping, snapshotting) ---
+	// These represent in-flight operations. If the sandbox is no longer alive on the
+	// host past the grace period, infer the final state from the transient status.
+
+	transientSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
+		HostID:  host.ID,
+		Column2: []string{"starting", "resuming", "pausing", "stopping", "snapshotting"},
+	})
+	if err != nil {
+		slog.Warn("host monitor: failed to list transient sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
+		return
+	}
+
+	for _, sb := range transientSandboxes {
+		sbIDStr := id.FormatSandboxID(sb.ID)
+		if agentStatus, ok := aliveStatus[sbIDStr]; ok {
+			// Sandbox is alive on host — the background goroutine should
+			// finalize the transition. For starting/resuming, if the sandbox
+			// is alive it means creation/resume succeeded.
+			if sb.Status == "starting" || sb.Status == "resuming" {
+				if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
+					ID: sb.ID, Status: sb.Status, Status_2: "running",
+				}); err == nil {
+					slog.Info("host monitor: promoted transient sandbox to running", "sandbox_id", sbIDStr, "from", sb.Status)
+				}
+			}
+			// A snapshot keeps the source sandbox alive throughout, so an alive
+			// sandbox does NOT mean the snapshot finished. Only recover it once
+			// it has been stuck past the snapshot grace period (i.e. the CP
+			// crashed mid-op). Recover to the sandbox's actual host-side status:
+			// a running sandbox is snapshotted live and stays running, but a
+			// paused sandbox is snapshotted from disk and must return to paused.
+			if sb.Status == "snapshotting" &&
+				sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) >= snapshotGracePeriod {
+				recoverTo := agentStatus
+				if recoverTo != "running" && recoverTo != "paused" {
+					// Coerced/unknown agent label — default to running.
+					recoverTo = "running"
+				}
+				if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
+					ID: sb.ID, Status: "snapshotting", Status_2: recoverTo,
+				}); err == nil {
+					slog.Info("host monitor: recovered stuck snapshotting sandbox", "sandbox_id", sbIDStr, "to", recoverTo)
+					m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "snapshot_recovered", nil)
+				}
+			}
+			continue
+		}
+		// Sandbox is not alive on host. If the transition is recent, give the
+		// in-flight RPC time to finish before declaring a final state.
+		if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
+			slog.Debug("host monitor: transient sandbox still within grace period",
+				"sandbox_id", sbIDStr, "status", sb.Status,
+				"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
+			continue
+		}
+
+		// Grace period expired — infer final state.
+		var finalStatus string
+		switch sb.Status {
+		case "starting", "resuming":
+			finalStatus = "error"
+		case "pausing":
+			finalStatus = "paused"
+		case "stopping":
+			finalStatus = "stopped"
+		case "snapshotting":
+			// VM is gone but DB says snapshotting → the snapshot died with the VM.
+			finalStatus = "error"
+		}
+		fromStatus := sb.Status
+		if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
+			ID: sb.ID, Status: fromStatus, Status_2: finalStatus,
+		}); err == nil {
+			slog.Info("host monitor: resolved transient sandbox", "sandbox_id", sbIDStr, "from", fromStatus, "to", finalStatus)
+			inferredErr := errInferredTransientTimeout
+			switch fromStatus {
+			case "starting":
+				m.audit.LogSandboxCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
+			case "resuming":
+				m.audit.LogSandboxResumeSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
+			case "pausing":
+				// Pause assumed to have succeeded host-side; emit success with inferred metadata.
+				m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
+			case "snapshotting":
+				// VM gone mid-snapshot; the sandbox is errored.
+				m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
+			case "stopping":
+				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
+			}
 		}
 	}
 }