wrenn-releases/internal/api/host_monitor.go

package api

import (
	"context"
	"errors"
	"log/slog"
	"time"

	"connectrpc.com/connect"
	"github.com/jackc/pgx/v5/pgtype"

	"git.omukk.dev/wrenn/wrenn/pkg/audit"
	"git.omukk.dev/wrenn/wrenn/pkg/db"
	"git.omukk.dev/wrenn/wrenn/pkg/id"
	"git.omukk.dev/wrenn/wrenn/pkg/lifecycle"
	pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
)

// errInferredTransientTimeout marks a state change that the reconciler
// inferred after a transient (starting/resuming) sandbox failed to settle
// within the grace period. Used as the err value on system audit calls so
// the published event carries Outcome=error with a human-readable message.
var errInferredTransientTimeout = errors.New("transient state did not settle within grace period")

// unreachableThreshold is how long a host can go without a heartbeat before
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second

// transientGracePeriod is how long a sandbox is allowed to stay in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state. This prevents the monitor from racing against in-flight RPCs
// that may not have registered the sandbox on the host agent yet.
const transientGracePeriod = 2 * time.Minute

// snapshotGracePeriod is the grace for a sandbox stuck in "snapshotting" while
// the VM is still alive on the host. Snapshots dump guest RAM and flatten the
// rootfs, which can run for minutes on large sandboxes, and the agent reports
// the VM as alive throughout — so we must not race the in-flight operation.
// It exceeds the background goroutine's 10-minute deadline, so reaching it
// means the control plane crashed mid-snapshot and the sandbox needs recovery.
const snapshotGracePeriod = 15 * time.Minute

// HostMonitor runs on a fixed interval and performs two duties:
//
//  1. Passive check: marks hosts whose last_heartbeat_at is stale as
//     "unreachable" and marks their active sandboxes as "missing".
//
//  2. Active reconciliation: for each online host, calls ListSandboxes and
//     reconciles DB state against live host state — restoring "missing"
//     sandboxes that are actually alive, and stopping orphaned ones.
type HostMonitor struct {
	db       *db.Queries
	pool     *lifecycle.HostClientPool
	audit    *audit.AuditLogger
	interval time.Duration
}

// NewHostMonitor creates a HostMonitor.
func NewHostMonitor(queries *db.Queries, pool *lifecycle.HostClientPool, al *audit.AuditLogger, interval time.Duration) *HostMonitor {
	return &HostMonitor{
		db:       queries,
		pool:     pool,
		audit:    al,
		interval: interval,
	}
}

// Start runs the monitor loop until the context is cancelled.
func (m *HostMonitor) Start(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(m.interval)
		defer ticker.Stop()

		// Run immediately on startup so the CP doesn't wait one full interval
		// before reconciling host and sandbox state.
		m.run(ctx)

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				m.run(ctx)
			}
		}
	}()
}

func (m *HostMonitor) run(ctx context.Context) {
	hosts, err := m.db.ListActiveHosts(ctx)
	if err != nil {
		slog.Warn("host monitor: failed to list hosts", "error", err)
		return
	}

	for _, host := range hosts {
		m.checkHost(ctx, host)
	}
}

// ReconcileHost triggers immediate active reconciliation for a single host.
// Called when a host transitions from unreachable → online so sandboxes marked
// "missing" are resolved without waiting for the next monitor tick.
func (m *HostMonitor) ReconcileHost(ctx context.Context, hostID pgtype.UUID) {
	host, err := m.db.GetHost(ctx, hostID)
	if err != nil {
		slog.Warn("host monitor: reconcile-on-connect: failed to get host", "error", err)
		return
	}
	if host.Status != "online" {
		return
	}
	m.checkHost(ctx, host)
}

func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
	// --- Passive phase: check heartbeat staleness ---

	stale := !host.LastHeartbeatAt.Valid ||
		time.Since(host.LastHeartbeatAt.Time) > unreachableThreshold

	if stale && host.Status != "unreachable" {
		slog.Info("host monitor: marking host unreachable", "host_id", id.FormatHostID(host.ID),
			"last_heartbeat", host.LastHeartbeatAt.Time)
		if err := m.db.MarkHostUnreachable(ctx, host.ID); err != nil {
			slog.Warn("host monitor: failed to mark host unreachable", "host_id", id.FormatHostID(host.ID), "error", err)
		}
		if err := m.db.MarkSandboxesMissingByHost(ctx, host.ID); err != nil {
			slog.Warn("host monitor: failed to mark sandboxes missing", "host_id", id.FormatHostID(host.ID), "error", err)
		}
		m.audit.LogHostMarkedDown(ctx, host.TeamID, host.ID)
		return
	}

	// --- Active reconciliation: only for online hosts ---

	if host.Status != "online" {
		return
	}

	agent, err := m.pool.GetForHost(host)
	if err != nil {
		// Host has no address yet (e.g., just registered) — skip.
		return
	}

	resp, err := agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
	if err != nil {
		// RPC failure is a transient condition; the passive phase will catch it
		// if heartbeats stop arriving.
		slog.Debug("host monitor: ListSandboxes failed (transient)", "host_id", id.FormatHostID(host.ID), "error", err)
		return
	}

	// Build map of sandbox ID -> reported status. Transient statuses
	// (pausing/resuming/starting/stopping) are coerced to a presence-only
	// entry: ListSandboxes can observe the in-memory status mid-transition
	// (Pause flips the status under m.mu while List holds m.mu.RLock), and
	// writing those transient labels into the DB would force the transient
	// reconciliation phase to wait the full grace period before resolving.
	// Recording the presence keeps "missing → restore" and "running →
	// orphan-stop" logic correct without overwriting with stale labels;
	// the next monitor tick reads the settled status.
	aliveStatus := make(map[string]string, len(resp.Msg.Sandboxes))
	for _, sb := range resp.Msg.Sandboxes {
		status := sb.Status
		switch status {
		case "pausing", "resuming", "starting", "stopping":
			status = ""
		}
		aliveStatus[sb.SandboxId] = status
	}

	// --- Restore sandboxes that are "missing" in DB but alive on host ---
	// Handles transient heartbeat gaps where the host was actually fine. The
	// reported status must be honored: a sandbox the agent paused while CP
	// was disconnected must not be silently promoted back to running.

	missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
		HostID:  host.ID,
		Column2: []string{"missing"},
	})
	if err != nil {
		slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
	} else {
		restoreByStatus := make(map[string][]db.Sandbox)
		var toStop []db.Sandbox
		for _, sb := range missingSandboxes {
			sbIDStr := id.FormatSandboxID(sb.ID)
			status, ok := aliveStatus[sbIDStr]
			if !ok {
				toStop = append(toStop, sb)
				continue
			}
			if status == "" {
				continue
			}
			restoreByStatus[status] = append(restoreByStatus[status], sb)
		}
		for status, sbs := range restoreByStatus {
			ids := make([]pgtype.UUID, len(sbs))
			for i, sb := range sbs {
				ids[i] = sb.ID
			}
			slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
			if err := m.db.BulkRestoreMissingToStatus(ctx, db.BulkRestoreMissingToStatusParams{
				Column1: ids,
				Status:  status,
			}); err != nil {
				slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
				continue
			}
			// Only restore→paused emits a notification (per design: running restore is silent).
			if status == "paused" {
				for _, sb := range sbs {
					m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "restored_after_host_recovery", nil)
				}
			}
		}
		if len(toStop) > 0 {
			ids := make([]pgtype.UUID, len(toStop))
			for i, sb := range toStop {
				ids[i] = sb.ID
			}
			slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(ids))
			if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
				Column1: ids,
				Status:  "stopped",
			}); err != nil {
				slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
			} else {
				for _, sb := range toStop {
					m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
				}
			}
		}
	}

	// --- Reconcile running sandboxes in DB against live host state ---
	// Three cases per DB-running row:
	//   absent on host          -> stopped
	//   present and running     -> no change
	//   present but paused/etc. -> sync DB to reported status (catches the
	//                              shutdown-pause notify failure case)

	runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
		HostID:  host.ID,
		Column2: []string{"running"},
	})
	if err != nil {
		slog.Warn("host monitor: failed to list running sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
		return
	}

	var toStop []db.Sandbox
	syncByStatus := make(map[string][]db.Sandbox)
	for _, sb := range runningSandboxes {
		sbIDStr := id.FormatSandboxID(sb.ID)
		status, ok := aliveStatus[sbIDStr]
		if !ok {
			toStop = append(toStop, sb)
			continue
		}
		if status == "running" || status == "" {
			continue
		}
		syncByStatus[status] = append(syncByStatus[status], sb)
	}

	if len(toStop) > 0 {
		ids := make([]pgtype.UUID, len(toStop))
		for i, sb := range toStop {
			ids[i] = sb.ID
		}
		slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(ids))
		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
			Column1: ids,
			Status:  "stopped",
		}); err != nil {
			slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
		} else {
			for _, sb := range toStop {
				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
			}
		}
	}
	for status, sbs := range syncByStatus {
		ids := make([]pgtype.UUID, len(sbs))
		for i, sb := range sbs {
			ids[i] = sb.ID
		}
		slog.Info("host monitor: syncing running→reported status", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
			Column1: ids,
			Status:  status,
		}); err != nil {
			slog.Warn("host monitor: failed to sync running sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
			continue
		}
		if status == "paused" {
			for _, sb := range sbs {
				m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "host_state_sync", nil)
			}
		}
	}

	// --- Reconcile DB-stopped + agent-paused zombies ---
	// A sandbox the agent reports as 'paused' but DB has as 'stopped' is an
	// orphan from a previous bug where a successful pause's auto_paused
	// callback was lost (e.g. CP unreachable during agent shutdown). With the
	// agent-side fix (RestorePausedSandboxes), the snapshot survives across
	// agent restarts and surfaces here. Authoritative direction: DB wins
	// (user already saw 'stopped' and may have stopped tracking it).
	// Issue Destroy so the on-disk snapshot dir is removed and the agent's
	// slot reservation released.
	//
	// Gate: only run the DB query if the agent reports at least one paused
	// sandbox. Otherwise we'd fetch every historically-stopped sandbox on
	// this host every monitor tick — unbounded growth over a host's lifetime.
	hasPaused := false
	for _, status := range aliveStatus {
		if status == "paused" {
			hasPaused = true
			break
		}
	}
	if hasPaused {
		stoppedSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
			HostID:  host.ID,
			Column2: []string{"stopped"},
		})
		if err != nil {
			slog.Warn("host monitor: failed to list stopped sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
		} else {
			for _, sb := range stoppedSandboxes {
				sbIDStr := id.FormatSandboxID(sb.ID)
				status, ok := aliveStatus[sbIDStr]
				if !ok || status != "paused" {
					continue
				}
				slog.Info("host monitor: destroying DB-stopped agent-paused zombie",
					"host_id", id.FormatHostID(host.ID), "sandbox_id", sbIDStr)
				if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
					SandboxId: sbIDStr,
				})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
					slog.Warn("host monitor: zombie destroy failed",
						"sandbox_id", sbIDStr, "error", err)
					continue
				}
				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "paused_zombie_cleanup", nil)
			}
		}
	}

	// --- Reconcile transient statuses (starting, resuming, pausing, stopping) ---
	// These represent in-flight operations. If the sandbox is no longer alive on
	// the host, infer the final state based on the transient status.

	transientSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
		HostID:  host.ID,
		Column2: []string{"starting", "resuming", "pausing", "stopping", "snapshotting"},
	})
	if err != nil {
		slog.Warn("host monitor: failed to list transient sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
		return
	}

	for _, sb := range transientSandboxes {
		sbIDStr := id.FormatSandboxID(sb.ID)
		if agentStatus, ok := aliveStatus[sbIDStr]; ok {
			// Sandbox is alive on host — the background goroutine should
			// finalize the transition. For starting/resuming, if the sandbox
			// is alive it means creation/resume succeeded.
			if sb.Status == "starting" || sb.Status == "resuming" {
				if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
					ID: sb.ID, Status: sb.Status, Status_2: "running",
				}); err == nil {
					slog.Info("host monitor: promoted transient sandbox to running", "sandbox_id", sbIDStr, "from", sb.Status)
				}
			}
			// A snapshot keeps the source sandbox alive throughout, so an alive
			// sandbox does NOT mean the snapshot finished. Only recover it once
			// it has been stuck past the snapshot grace period (i.e. the CP
			// crashed mid-op). Recover to the sandbox's actual host-side status:
			// a running sandbox is snapshotted live and stays running, but a
			// paused sandbox is snapshotted from disk and must return to paused.
			if sb.Status == "snapshotting" &&
				sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) >= snapshotGracePeriod {
				recoverTo := agentStatus
				if recoverTo != "running" && recoverTo != "paused" {
					// Coerced/unknown agent label — default to running.
					recoverTo = "running"
				}
				if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
					ID: sb.ID, Status: "snapshotting", Status_2: recoverTo,
				}); err == nil {
					slog.Info("host monitor: recovered stuck snapshotting sandbox", "sandbox_id", sbIDStr, "to", recoverTo)
					m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "snapshot_recovered", nil)
				}
			}
			continue
		}
		// Sandbox is not alive on host. If the transition is recent, give the
		// in-flight RPC time to finish before declaring a final state.
		if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
			slog.Debug("host monitor: transient sandbox still within grace period",
				"sandbox_id", sbIDStr, "status", sb.Status,
				"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
			continue
		}

		// Grace period expired — infer final state.
		var finalStatus string
		switch sb.Status {
		case "starting", "resuming":
			finalStatus = "error"
		case "pausing":
			finalStatus = "paused"
		case "stopping":
			finalStatus = "stopped"
		case "snapshotting":
			// VM is gone but DB says snapshotting → the snapshot died with the VM.
			finalStatus = "error"
		}
		fromStatus := sb.Status
		if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
			ID: sb.ID, Status: fromStatus, Status_2: finalStatus,
		}); err == nil {
			slog.Info("host monitor: resolved transient sandbox", "sandbox_id", sbIDStr, "from", fromStatus, "to", finalStatus)
			inferredErr := errInferredTransientTimeout
			switch fromStatus {
			case "starting":
				m.audit.LogSandboxCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
			case "resuming":
				m.audit.LogSandboxResumeSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
			case "pausing":
				// Pause assumed to have succeeded host-side; emit success with inferred metadata.
				m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
			case "snapshotting":
				// VM gone mid-snapshot; the sandbox is errored.
				m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
			case "stopping":
				m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
			}
		}
	}
}