forked from wrenn/wrenn
Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required. Key changes: - Remove fc.go, jailer.go (FC process management) - Remove internal/uffd/ package (userfaultfd lazy page loading) - Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format) - Add ch.go (CH HTTP API client over Unix socket) - Add process.go (CH process lifecycle with unshare+netns) - Add chversion.go (CH version detection) - Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher - Simplify snapshot/local.go to CH's native snapshot format - Update VM config: FirecrackerBin → VMMBin, new CH-specific fields - Update envdclient, devicemapper, network for CH compatibility
292 lines
9.9 KiB
Go
292 lines
9.9 KiB
Go
package api
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"connectrpc.com/connect"
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
|
|
"git.omukk.dev/wrenn/wrenn/pkg/audit"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/db"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/id"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/lifecycle"
|
|
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
|
|
)
|
|
|
|
// unreachableThreshold is how long a host can go without a heartbeat before
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second

// transientGracePeriod is how long a sandbox is allowed to stay in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state. This prevents the monitor from racing against in-flight RPCs
// that may not have registered the sandbox on the host agent yet.
// NOTE(review): 2 minutes presumably exceeds the slowest start/resume path —
// confirm against the host agent's operation timeouts.
const transientGracePeriod = 2 * time.Minute
|
|
|
|
// HostMonitor runs on a fixed interval and performs two duties:
//
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
// "unreachable" and marks their active sandboxes as "missing".
//
// 2. Active reconciliation: for each online host, calls ListSandboxes and
// reconciles DB state against live host state — restoring "missing"
// sandboxes that are actually alive, and stopping orphaned ones.
type HostMonitor struct {
	db       *db.Queries              // control-plane database queries
	pool     *lifecycle.HostClientPool // per-host agent RPC clients
	audit    *audit.AuditLogger        // records host-down / auto-pause events
	interval time.Duration             // how often the monitor loop runs
}
|
|
|
|
// NewHostMonitor creates a HostMonitor.
|
|
func NewHostMonitor(queries *db.Queries, pool *lifecycle.HostClientPool, al *audit.AuditLogger, interval time.Duration) *HostMonitor {
|
|
return &HostMonitor{
|
|
db: queries,
|
|
pool: pool,
|
|
audit: al,
|
|
interval: interval,
|
|
}
|
|
}
|
|
|
|
// Start runs the monitor loop until the context is cancelled.
|
|
func (m *HostMonitor) Start(ctx context.Context) {
|
|
go func() {
|
|
ticker := time.NewTicker(m.interval)
|
|
defer ticker.Stop()
|
|
|
|
// Run immediately on startup so the CP doesn't wait one full interval
|
|
// before reconciling host and sandbox state.
|
|
m.run(ctx)
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
m.run(ctx)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (m *HostMonitor) run(ctx context.Context) {
|
|
hosts, err := m.db.ListActiveHosts(ctx)
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list hosts", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, host := range hosts {
|
|
m.checkHost(ctx, host)
|
|
}
|
|
}
|
|
|
|
// ReconcileHost triggers immediate active reconciliation for a single host.
|
|
// Called when a host transitions from unreachable → online so sandboxes marked
|
|
// "missing" are resolved without waiting for the next monitor tick.
|
|
func (m *HostMonitor) ReconcileHost(ctx context.Context, hostID pgtype.UUID) {
|
|
host, err := m.db.GetHost(ctx, hostID)
|
|
if err != nil {
|
|
slog.Warn("host monitor: reconcile-on-connect: failed to get host", "error", err)
|
|
return
|
|
}
|
|
if host.Status != "online" {
|
|
return
|
|
}
|
|
m.checkHost(ctx, host)
|
|
}
|
|
|
|
func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
|
// --- Passive phase: check heartbeat staleness ---
|
|
|
|
stale := !host.LastHeartbeatAt.Valid ||
|
|
time.Since(host.LastHeartbeatAt.Time) > unreachableThreshold
|
|
|
|
if stale && host.Status != "unreachable" {
|
|
slog.Info("host monitor: marking host unreachable", "host_id", id.FormatHostID(host.ID),
|
|
"last_heartbeat", host.LastHeartbeatAt.Time)
|
|
if err := m.db.MarkHostUnreachable(ctx, host.ID); err != nil {
|
|
slog.Warn("host monitor: failed to mark host unreachable", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
if err := m.db.MarkSandboxesMissingByHost(ctx, host.ID); err != nil {
|
|
slog.Warn("host monitor: failed to mark sandboxes missing", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
m.audit.LogHostMarkedDown(ctx, host.TeamID, host.ID)
|
|
return
|
|
}
|
|
|
|
// --- Active reconciliation: only for online hosts ---
|
|
|
|
if host.Status != "online" {
|
|
return
|
|
}
|
|
|
|
agent, err := m.pool.GetForHost(host)
|
|
if err != nil {
|
|
// Host has no address yet (e.g., just registered) — skip.
|
|
return
|
|
}
|
|
|
|
resp, err := agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
|
|
if err != nil {
|
|
// RPC failure is a transient condition; the passive phase will catch it
|
|
// if heartbeats stop arriving.
|
|
slog.Debug("host monitor: ListSandboxes failed (transient)", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
// Build set of sandbox IDs alive on the host.
|
|
// The host agent returns sandbox IDs as strings (formatted with prefix).
|
|
alive := make(map[string]struct{}, len(resp.Msg.Sandboxes))
|
|
for _, sb := range resp.Msg.Sandboxes {
|
|
alive[sb.SandboxId] = struct{}{}
|
|
}
|
|
|
|
autoPaused := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
|
|
for _, apID := range resp.Msg.AutoPausedSandboxIds {
|
|
autoPaused[apID] = struct{}{}
|
|
}
|
|
|
|
// --- Restore sandboxes that are "missing" in DB but alive on host ---
|
|
// This handles the case where CP marked them missing due to a transient
|
|
// heartbeat gap, but the host was actually fine.
|
|
|
|
missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"missing"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
} else {
|
|
var toRestore []pgtype.UUID
|
|
var toStop []pgtype.UUID
|
|
for _, sb := range missingSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
toRestore = append(toRestore, sb.ID)
|
|
} else {
|
|
toStop = append(toStop, sb.ID)
|
|
}
|
|
}
|
|
if len(toRestore) > 0 {
|
|
slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toRestore))
|
|
if err := m.db.BulkRestoreRunning(ctx, toRestore); err != nil {
|
|
slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
if len(toStop) > 0 {
|
|
slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toStop,
|
|
Status: "stopped",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// --- Find running sandboxes in DB that are no longer alive on the host ---
|
|
|
|
runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"running"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list running sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
var toPause, toStop []pgtype.UUID
|
|
sbTeamID := make(map[pgtype.UUID]pgtype.UUID, len(runningSandboxes))
|
|
for _, sb := range runningSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
sbTeamID[sb.ID] = sb.TeamID
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
continue
|
|
}
|
|
if _, ok := autoPaused[sbIDStr]; ok {
|
|
toPause = append(toPause, sb.ID)
|
|
} else {
|
|
toStop = append(toStop, sb.ID)
|
|
}
|
|
}
|
|
|
|
if len(toPause) > 0 {
|
|
slog.Info("host monitor: marking auto-paused sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toPause))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toPause,
|
|
Status: "paused",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to mark paused", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
for _, sbID := range toPause {
|
|
m.audit.LogSandboxAutoPause(ctx, sbTeamID[sbID], sbID)
|
|
}
|
|
}
|
|
if len(toStop) > 0 {
|
|
slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
|
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
|
Column1: toStop,
|
|
Status: "stopped",
|
|
}); err != nil {
|
|
slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
}
|
|
}
|
|
|
|
// --- Reconcile transient statuses (starting, resuming, pausing, stopping) ---
|
|
// These represent in-flight operations. If the sandbox is no longer alive on
|
|
// the host, infer the final state based on the transient status.
|
|
|
|
transientSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
|
HostID: host.ID,
|
|
Column2: []string{"starting", "resuming", "pausing", "stopping"},
|
|
})
|
|
if err != nil {
|
|
slog.Warn("host monitor: failed to list transient sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
|
return
|
|
}
|
|
|
|
for _, sb := range transientSandboxes {
|
|
sbIDStr := id.FormatSandboxID(sb.ID)
|
|
if _, ok := alive[sbIDStr]; ok {
|
|
// Sandbox is alive on host — the background goroutine should
|
|
// finalize the transition. For starting/resuming, if the sandbox
|
|
// is alive it means creation/resume succeeded.
|
|
if sb.Status == "starting" || sb.Status == "resuming" {
|
|
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sb.ID, Status: sb.Status, Status_2: "running",
|
|
}); err == nil {
|
|
slog.Info("host monitor: promoted transient sandbox to running", "sandbox_id", sbIDStr, "from", sb.Status)
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
// Sandbox is not alive on host. If the transition is recent, give the
|
|
// in-flight RPC time to finish before declaring a final state.
|
|
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
|
|
slog.Debug("host monitor: transient sandbox still within grace period",
|
|
"sandbox_id", sbIDStr, "status", sb.Status,
|
|
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
|
|
continue
|
|
}
|
|
|
|
// Grace period expired — infer final state.
|
|
var finalStatus string
|
|
switch sb.Status {
|
|
case "starting", "resuming":
|
|
finalStatus = "error"
|
|
case "pausing":
|
|
finalStatus = "paused"
|
|
case "stopping":
|
|
finalStatus = "stopped"
|
|
}
|
|
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sb.ID, Status: sb.Status, Status_2: finalStatus,
|
|
}); err == nil {
|
|
slog.Info("host monitor: resolved transient sandbox", "sandbox_id", sbIDStr, "from", sb.Status, "to", finalStatus)
|
|
}
|
|
}
|
|
}
|