forked from wrenn/wrenn
v0.2.0 (#50)
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
This commit is contained in:
@ -2,6 +2,7 @@ package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
@ -15,10 +16,30 @@ import (
|
||||
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
|
||||
)
|
||||
|
||||
// errInferredTransientTimeout marks a state change that the reconciler
|
||||
// inferred after a transient (starting/resuming/snapshotting) sandbox failed
// to settle
|
||||
// within the grace period. Used as the err value on system audit calls so
|
||||
// the published event carries Outcome=error with a human-readable message.
|
||||
var errInferredTransientTimeout = errors.New("transient state did not settle within grace period")
|
||||
|
||||
// unreachableThreshold is how long a host can go without a heartbeat before
|
||||
// it is considered unreachable (three missed heartbeats at a 30-second interval).
|
||||
const unreachableThreshold = 90 * time.Second
|
||||
|
||||
// transientGracePeriod is how long a sandbox that the host does not report
|
||||
// may stay in a transient status (starting, resuming, pausing, stopping,
// snapshotting) before the monitor infers a
|
||||
// final state. This prevents the monitor from racing against in-flight RPCs
|
||||
// that may not have registered the sandbox on the host agent yet.
//
// Inferred final states: starting/resuming/snapshotting become "error",
// pausing becomes "paused", and stopping becomes "stopped".
|
||||
const transientGracePeriod = 2 * time.Minute
|
||||
|
||||
// snapshotGracePeriod is the grace for a sandbox stuck in "snapshotting" while
|
||||
// the VM is still alive on the host. Snapshots dump guest RAM and flatten the
|
||||
// rootfs, which can run for minutes on large sandboxes, and the agent reports
|
||||
// the VM as alive throughout — so we must not race the in-flight operation.
|
||||
// It exceeds the background goroutine's 10-minute deadline, so reaching it
|
||||
// means the control plane crashed mid-snapshot and the sandbox needs recovery.
//
// Once the period is exceeded, the monitor moves the row from "snapshotting"
// back to the status the agent reports ("running" or "paused"); any other
// reported label falls back to "running".
|
||||
const snapshotGracePeriod = 15 * time.Minute
|
||||
|
||||
// HostMonitor runs on a fixed interval and performs two duties:
|
||||
//
|
||||
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
|
||||
@ -77,6 +98,21 @@ func (m *HostMonitor) run(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// ReconcileHost triggers immediate active reconciliation for a single host.
|
||||
// Called when a host transitions from unreachable → online so sandboxes marked
|
||||
// "missing" are resolved without waiting for the next monitor tick.
//
// The host row is loaded by hostID. If the lookup fails, a warning is logged
// and the call returns without reconciling. Hosts whose status is anything
// other than "online" are skipped silently. Otherwise the work is delegated
// to checkHost. Nothing is returned to the caller.
|
||||
func (m *HostMonitor) ReconcileHost(ctx context.Context, hostID pgtype.UUID) {
|
||||
host, err := m.db.GetHost(ctx, hostID)
|
||||
if err != nil {
|
||||
// Best-effort: a failed lookup is only logged, never propagated.
slog.Warn("host monitor: reconcile-on-connect: failed to get host", "error", err)
|
||||
return
|
||||
}
|
||||
// Only reconcile hosts currently recorded as online; any other status is
// left untouched by this call.
if host.Status != "online" {
|
||||
return
|
||||
}
|
||||
// Reuse the same per-host check the monitor loop performs.
m.checkHost(ctx, host)
|
||||
}
|
||||
|
||||
func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
// --- Passive phase: check heartbeat staleness ---
|
||||
|
||||
@ -116,21 +152,29 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
return
|
||||
}
|
||||
|
||||
// Build set of sandbox IDs alive on the host.
|
||||
// The host agent returns sandbox IDs as strings (formatted with prefix).
|
||||
alive := make(map[string]struct{}, len(resp.Msg.Sandboxes))
|
||||
// Build map of sandbox ID -> reported status. Transient statuses
|
||||
// (pausing/resuming/starting/stopping) are coerced to a presence-only
|
||||
// entry: ListSandboxes can observe the in-memory status mid-transition
|
||||
// (Pause flips the status under m.mu while List holds m.mu.RLock), and
|
||||
// writing those transient labels into the DB would force the transient
|
||||
// reconciliation phase to wait the full grace period before resolving.
|
||||
// Recording the presence keeps "missing → restore" and "running →
|
||||
// orphan-stop" logic correct without overwriting with stale labels;
|
||||
// the next monitor tick reads the settled status.
|
||||
aliveStatus := make(map[string]string, len(resp.Msg.Sandboxes))
|
||||
for _, sb := range resp.Msg.Sandboxes {
|
||||
alive[sb.SandboxId] = struct{}{}
|
||||
}
|
||||
|
||||
autoPaused := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
|
||||
for _, apID := range resp.Msg.AutoPausedSandboxIds {
|
||||
autoPaused[apID] = struct{}{}
|
||||
status := sb.Status
|
||||
switch status {
|
||||
case "pausing", "resuming", "starting", "stopping":
|
||||
status = ""
|
||||
}
|
||||
aliveStatus[sb.SandboxId] = status
|
||||
}
|
||||
|
||||
// --- Restore sandboxes that are "missing" in DB but alive on host ---
|
||||
// This handles the case where CP marked them missing due to a transient
|
||||
// heartbeat gap, but the host was actually fine.
|
||||
// Handles transient heartbeat gaps where the host was actually fine. The
|
||||
// reported status must be honored: a sandbox the agent paused while CP
|
||||
// was disconnected must not be silently promoted back to running.
|
||||
|
||||
missingSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
||||
HostID: host.ID,
|
||||
@ -139,34 +183,65 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
if err != nil {
|
||||
slog.Warn("host monitor: failed to list missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
} else {
|
||||
var toRestore []pgtype.UUID
|
||||
var toStop []pgtype.UUID
|
||||
restoreByStatus := make(map[string][]db.Sandbox)
|
||||
var toStop []db.Sandbox
|
||||
for _, sb := range missingSandboxes {
|
||||
sbIDStr := id.FormatSandboxID(sb.ID)
|
||||
if _, ok := alive[sbIDStr]; ok {
|
||||
toRestore = append(toRestore, sb.ID)
|
||||
} else {
|
||||
toStop = append(toStop, sb.ID)
|
||||
status, ok := aliveStatus[sbIDStr]
|
||||
if !ok {
|
||||
toStop = append(toStop, sb)
|
||||
continue
|
||||
}
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
restoreByStatus[status] = append(restoreByStatus[status], sb)
|
||||
}
|
||||
if len(toRestore) > 0 {
|
||||
slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toRestore))
|
||||
if err := m.db.BulkRestoreRunning(ctx, toRestore); err != nil {
|
||||
slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
for status, sbs := range restoreByStatus {
|
||||
ids := make([]pgtype.UUID, len(sbs))
|
||||
for i, sb := range sbs {
|
||||
ids[i] = sb.ID
|
||||
}
|
||||
slog.Info("host monitor: restoring missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
|
||||
if err := m.db.BulkRestoreMissingToStatus(ctx, db.BulkRestoreMissingToStatusParams{
|
||||
Column1: ids,
|
||||
Status: status,
|
||||
}); err != nil {
|
||||
slog.Warn("host monitor: failed to restore missing sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
|
||||
continue
|
||||
}
|
||||
// Only restore→paused emits a notification (per design: running restore is silent).
|
||||
if status == "paused" {
|
||||
for _, sb := range sbs {
|
||||
m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "restored_after_host_recovery", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(toStop) > 0 {
|
||||
slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
||||
ids := make([]pgtype.UUID, len(toStop))
|
||||
for i, sb := range toStop {
|
||||
ids[i] = sb.ID
|
||||
}
|
||||
slog.Info("host monitor: stopping confirmed-dead missing sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(ids))
|
||||
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: toStop,
|
||||
Column1: ids,
|
||||
Status: "stopped",
|
||||
}); err != nil {
|
||||
slog.Warn("host monitor: failed to stop missing sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
} else {
|
||||
for _, sb := range toStop {
|
||||
m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Find running sandboxes in DB that are no longer alive on the host ---
|
||||
// --- Reconcile running sandboxes in DB against live host state ---
|
||||
// Three cases per DB-running row:
|
||||
// absent on host -> stopped
|
||||
// present and running -> no change
|
||||
// present but paused/etc. -> sync DB to reported status (catches the
|
||||
// shutdown-pause notify failure case)
|
||||
|
||||
runningSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
||||
HostID: host.ID,
|
||||
@ -177,40 +252,196 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
return
|
||||
}
|
||||
|
||||
var toPause, toStop []pgtype.UUID
|
||||
sbTeamID := make(map[pgtype.UUID]pgtype.UUID, len(runningSandboxes))
|
||||
var toStop []db.Sandbox
|
||||
syncByStatus := make(map[string][]db.Sandbox)
|
||||
for _, sb := range runningSandboxes {
|
||||
sbIDStr := id.FormatSandboxID(sb.ID)
|
||||
sbTeamID[sb.ID] = sb.TeamID
|
||||
if _, ok := alive[sbIDStr]; ok {
|
||||
status, ok := aliveStatus[sbIDStr]
|
||||
if !ok {
|
||||
toStop = append(toStop, sb)
|
||||
continue
|
||||
}
|
||||
if _, ok := autoPaused[sbIDStr]; ok {
|
||||
toPause = append(toPause, sb.ID)
|
||||
} else {
|
||||
toStop = append(toStop, sb.ID)
|
||||
if status == "running" || status == "" {
|
||||
continue
|
||||
}
|
||||
syncByStatus[status] = append(syncByStatus[status], sb)
|
||||
}
|
||||
|
||||
if len(toPause) > 0 {
|
||||
slog.Info("host monitor: marking auto-paused sandboxes", "host_id", id.FormatHostID(host.ID), "count", len(toPause))
|
||||
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: toPause,
|
||||
Status: "paused",
|
||||
}); err != nil {
|
||||
slog.Warn("host monitor: failed to mark paused", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
}
|
||||
for _, sbID := range toPause {
|
||||
m.audit.LogSandboxAutoPause(ctx, sbTeamID[sbID], sbID)
|
||||
}
|
||||
}
|
||||
if len(toStop) > 0 {
|
||||
slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(toStop))
|
||||
ids := make([]pgtype.UUID, len(toStop))
|
||||
for i, sb := range toStop {
|
||||
ids[i] = sb.ID
|
||||
}
|
||||
slog.Info("host monitor: marking orphaned sandboxes stopped", "host_id", id.FormatHostID(host.ID), "count", len(ids))
|
||||
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: toStop,
|
||||
Column1: ids,
|
||||
Status: "stopped",
|
||||
}); err != nil {
|
||||
slog.Warn("host monitor: failed to mark stopped", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
} else {
|
||||
for _, sb := range toStop {
|
||||
m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "orphaned", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
for status, sbs := range syncByStatus {
|
||||
ids := make([]pgtype.UUID, len(sbs))
|
||||
for i, sb := range sbs {
|
||||
ids[i] = sb.ID
|
||||
}
|
||||
slog.Info("host monitor: syncing running→reported status", "host_id", id.FormatHostID(host.ID), "status", status, "count", len(ids))
|
||||
if err := m.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: ids,
|
||||
Status: status,
|
||||
}); err != nil {
|
||||
slog.Warn("host monitor: failed to sync running sandboxes", "host_id", id.FormatHostID(host.ID), "status", status, "error", err)
|
||||
continue
|
||||
}
|
||||
if status == "paused" {
|
||||
for _, sb := range sbs {
|
||||
m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "host_state_sync", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reconcile DB-stopped + agent-paused zombies ---
|
||||
// A sandbox the agent reports as 'paused' but DB has as 'stopped' is an
|
||||
// orphan from a previous bug where a successful pause's auto_paused
|
||||
// callback was lost (e.g. CP unreachable during agent shutdown). With the
|
||||
// agent-side fix (RestorePausedSandboxes), the snapshot survives across
|
||||
// agent restarts and surfaces here. Authoritative direction: DB wins
|
||||
// (user already saw 'stopped' and may have stopped tracking it).
|
||||
// Issue Destroy so the on-disk snapshot dir is removed and the agent's
|
||||
// slot reservation released.
|
||||
//
|
||||
// Gate: only run the DB query if the agent reports at least one paused
|
||||
// sandbox. Otherwise we'd fetch every historically-stopped sandbox on
|
||||
// this host every monitor tick — unbounded growth over a host's lifetime.
|
||||
hasPaused := false
|
||||
for _, status := range aliveStatus {
|
||||
if status == "paused" {
|
||||
hasPaused = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if hasPaused {
|
||||
stoppedSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
||||
HostID: host.ID,
|
||||
Column2: []string{"stopped"},
|
||||
})
|
||||
if err != nil {
|
||||
slog.Warn("host monitor: failed to list stopped sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
} else {
|
||||
for _, sb := range stoppedSandboxes {
|
||||
sbIDStr := id.FormatSandboxID(sb.ID)
|
||||
status, ok := aliveStatus[sbIDStr]
|
||||
if !ok || status != "paused" {
|
||||
continue
|
||||
}
|
||||
slog.Info("host monitor: destroying DB-stopped agent-paused zombie",
|
||||
"host_id", id.FormatHostID(host.ID), "sandbox_id", sbIDStr)
|
||||
if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
|
||||
SandboxId: sbIDStr,
|
||||
})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
|
||||
slog.Warn("host monitor: zombie destroy failed",
|
||||
"sandbox_id", sbIDStr, "error", err)
|
||||
continue
|
||||
}
|
||||
m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "paused_zombie_cleanup", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- Reconcile transient statuses (starting, resuming, pausing, stopping) ---
|
||||
// These represent in-flight operations. If the sandbox is no longer alive on
|
||||
// the host, infer the final state based on the transient status.
|
||||
|
||||
transientSandboxes, err := m.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
||||
HostID: host.ID,
|
||||
Column2: []string{"starting", "resuming", "pausing", "stopping", "snapshotting"},
|
||||
})
|
||||
if err != nil {
|
||||
slog.Warn("host monitor: failed to list transient sandboxes", "host_id", id.FormatHostID(host.ID), "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
for _, sb := range transientSandboxes {
|
||||
sbIDStr := id.FormatSandboxID(sb.ID)
|
||||
if agentStatus, ok := aliveStatus[sbIDStr]; ok {
|
||||
// Sandbox is alive on host — the background goroutine should
|
||||
// finalize the transition. For starting/resuming, if the sandbox
|
||||
// is alive it means creation/resume succeeded.
|
||||
if sb.Status == "starting" || sb.Status == "resuming" {
|
||||
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sb.ID, Status: sb.Status, Status_2: "running",
|
||||
}); err == nil {
|
||||
slog.Info("host monitor: promoted transient sandbox to running", "sandbox_id", sbIDStr, "from", sb.Status)
|
||||
}
|
||||
}
|
||||
// A snapshot keeps the source sandbox alive throughout, so an alive
|
||||
// sandbox does NOT mean the snapshot finished. Only recover it once
|
||||
// it has been stuck past the snapshot grace period (i.e. the CP
|
||||
// crashed mid-op). Recover to the sandbox's actual host-side status:
|
||||
// a running sandbox is snapshotted live and stays running, but a
|
||||
// paused sandbox is snapshotted from disk and must return to paused.
|
||||
if sb.Status == "snapshotting" &&
|
||||
sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) >= snapshotGracePeriod {
|
||||
recoverTo := agentStatus
|
||||
if recoverTo != "running" && recoverTo != "paused" {
|
||||
// Coerced/unknown agent label — default to running.
|
||||
recoverTo = "running"
|
||||
}
|
||||
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sb.ID, Status: "snapshotting", Status_2: recoverTo,
|
||||
}); err == nil {
|
||||
slog.Info("host monitor: recovered stuck snapshotting sandbox", "sandbox_id", sbIDStr, "to", recoverTo)
|
||||
m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "snapshot_recovered", nil)
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Sandbox is not alive on host. If the transition is recent, give the
|
||||
// in-flight RPC time to finish before declaring a final state.
|
||||
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
|
||||
slog.Debug("host monitor: transient sandbox still within grace period",
|
||||
"sandbox_id", sbIDStr, "status", sb.Status,
|
||||
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
|
||||
continue
|
||||
}
|
||||
|
||||
// Grace period expired — infer final state.
|
||||
var finalStatus string
|
||||
switch sb.Status {
|
||||
case "starting", "resuming":
|
||||
finalStatus = "error"
|
||||
case "pausing":
|
||||
finalStatus = "paused"
|
||||
case "stopping":
|
||||
finalStatus = "stopped"
|
||||
case "snapshotting":
|
||||
// VM is gone but DB says snapshotting → the snapshot died with the VM.
|
||||
finalStatus = "error"
|
||||
}
|
||||
fromStatus := sb.Status
|
||||
if _, err := m.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sb.ID, Status: fromStatus, Status_2: finalStatus,
|
||||
}); err == nil {
|
||||
slog.Info("host monitor: resolved transient sandbox", "sandbox_id", sbIDStr, "from", fromStatus, "to", finalStatus)
|
||||
inferredErr := errInferredTransientTimeout
|
||||
switch fromStatus {
|
||||
case "starting":
|
||||
m.audit.LogSandboxCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
|
||||
case "resuming":
|
||||
m.audit.LogSandboxResumeSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
|
||||
case "pausing":
|
||||
// Pause assumed to have succeeded host-side; emit success with inferred metadata.
|
||||
m.audit.LogSandboxAutoPause(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
|
||||
case "snapshotting":
|
||||
// VM gone mid-snapshot; the sandbox is errored.
|
||||
m.audit.LogSnapshotCreateSystem(ctx, sb.TeamID, sb.ID, "transient_timeout", inferredErr)
|
||||
case "stopping":
|
||||
m.audit.LogSandboxDestroySystem(ctx, sb.TeamID, sb.ID, "transient_timeout_inferred", nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user