wrenn-releases/internal/api/host_monitor.go

package api

import (
	"context"
	"log/slog"
	"time"

	"connectrpc.com/connect"
	"github.com/jackc/pgx/v5/pgtype"

	"git.omukk.dev/wrenn/sandbox/internal/audit"
	"git.omukk.dev/wrenn/sandbox/internal/db"
	"git.omukk.dev/wrenn/sandbox/internal/id"
	"git.omukk.dev/wrenn/sandbox/internal/lifecycle"
	pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
)

// unreachableThreshold is how long a host can go without a heartbeat before
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second

// HostMonitor runs on a fixed interval and performs two duties:
//
//  1. Passive check: marks hosts whose last_heartbeat_at is stale as
//     "unreachable" and marks their active sandboxes as "missing".
//
//  2. Active reconciliation: for each online host, calls ListSandboxes and
//     reconciles DB state against live host state — restoring "missing"
//     sandboxes that are actually alive, and stopping orphaned ones.
type HostMonitor struct {
	db       *db.Queries
	pool     *lifecycle.HostClientPool
	audit    *audit.AuditLogger
	interval time.Duration
}

// NewHostMonitor creates a HostMonitor.
func NewHostMonitor(queries *db.Queries, pool *lifecycle.HostClientPool, al *audit.AuditLogger, interval time.Duration) *HostMonitor {
	return &HostMonitor{
		db:       queries,
		pool:     pool,
		audit:    al,
		interval: interval,
	}
}

// Start runs the monitor loop until the context is cancelled.
func (m *HostMonitor) Start(ctx context.Context) {
	go func() {
		ticker := time.NewTicker(m.interval)
		defer ticker.Stop()

		// Run immediately on startup so the CP doesn't wait one full interval
		// before reconciling host and sandbox state.
		m.run(ctx)

		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				m.run(ctx)
			}
		}
	}()
}

func (m *HostMonitor) run(ctx context.Context) {
	hosts, err := m.db.ListActiveHosts(ctx)
	if err != nil {
		slog.Warn("host monitor: failed to list hosts", "error", err)
		return
	}

	for _, host := range hosts {
		m.checkHost(ctx, host)
	}
}

func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
	// --- Passive phase: check heartbeat staleness ---

	stale := !host.LastHeartbeatAt.Valid ||
		time.Since(host.LastHeartbeatAt.Time) > unreachableThreshold

	if stale && host.Status != "unreachable" {
		slog.Info("host monitor: marking host unreachable", "host_id", id.FormatHostID(host.ID),
			"last_heartbeat", host.LastHeartbeatAt.Time)
		if err := m.db.MarkHostUnreachable(ctx, host.ID); err != nil {
			slog.Warn("host monitor: failed to mark host unreachable", "host_id", id.FormatHostID(host.ID), "error", err)
		}
		if err := m.db.MarkSandboxesMissingByHost(ctx, host.ID); err != nil {
			slog.Warn("host monitor: failed to mark sandboxes missing", "host_id", id.FormatHostID(host.ID), "error", err)
		}
		m.audit.LogHostMarkedDown(ctx, host.TeamID, host.ID)
		return
	}

	// --- Active reconciliation: only for online hosts ---

	if host.Status != "online" {
		return
	}

	agent, err := m.pool.GetForHost(host)
	if err != nil {
		// Host has no address yet (e.g., just registered) — skip.
		return
	}

	resp, err := agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
	if err != nil {
		// RPC failure is a transient condition; the passive phase will catch it
		// if heartbeats stop arriving.
		slog.Debug("host monitor: ListSandboxes failed (transient)", "host_id", id.FormatHostID(host.ID), "error", err)
		return
	}

	// Build set of sandbox IDs alive on the host.
	// The host agent returns sandbox IDs as strings (formatted with prefix).
	alive := make(map[string]struct{}, len(resp.Msg.Sandboxes))
	for _, sb := range resp.Msg.Sandboxes {
		alive[sb.SandboxId] = struct{}{}
	}

	autoPaused := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
	for _, apID := range resp.Msg.AutoPausedSandboxIds {
		autoPaused[apID] = struct{}{}
	}

	// --- Restore sandboxes that are "missing" in DB but alive on host ---
	// This handles the case where CP marked them missing due to a transient
	// heartbeat gap, but the host was actually fine.

	missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
		HostID:  host.ID,
		Column2: []string{"missing"},
	})
	if err != nil {
		slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
	} else {
		var toRestore []pgtype.UUID
		var toStop []pgtype.UUID
		for _, sb := range missingSandboxes {
			sbIDStr := id.FormatSandboxID(sb.ID)
			if _, ok := alive[sbIDStr]; ok {
				toRestore = append(toRestore, sb.ID)
			} else {
				toStop = append(toStop, sb.ID)
			}
		}
		if len(toRestore) > 0 {
			slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toRestore))
			if err := m.db.BulkRestoreRunning(ctx, toRestore); err != nil {
				slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
			}
		}
		if len(toStop) > 0 {
			slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
			if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
				Column1: toStop,
				Status:  "stopped",
			}); err != nil {
				slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
			}
		}
	}

	// --- Find running sandboxes in DB that are no longer alive on the host ---

	runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
		HostID:  host.ID,
		Column2: []string{"running"},
	})
	if err != nil {
		slog.Warn("host monitor: failed to list running sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
		return
	}

	var toPause, toStop []pgtype.UUID
	sbTeamID := make(map[pgtype.UUID]pgtype.UUID, len(runningSandboxes))
	for _, sb := range runningSandboxes {
		sbIDStr := id.FormatSandboxID(sb.ID)
		sbTeamID[sb.ID] = sb.TeamID
		if _, ok := alive[sbIDStr]; ok {
			continue
		}
		if _, ok := autoPaused[sbIDStr]; ok {
			toPause = append(toPause, sb.ID)
		} else {
			toStop = append(toStop, sb.ID)
		}
	}

	if len(toPause) > 0 {
		slog.Info("host monitor: marking auto-paused sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toPause))
		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
			Column1: toPause,
			Status:  "paused",
		}); err != nil {
			slog.Warn("host monitor: failed to mark paused", "host_id", id.FormatHostID(host.ID), "error", err)
		}
		for _, sbID := range toPause {
			m.audit.LogSandboxAutoPause(ctx, sbTeamID[sbID], sbID)
		}
	}
	if len(toStop) > 0 {
		slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
		if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
			Column1: toStop,
			Status:  "stopped",
		}); err != nil {
			slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
		}
	}
}