Implement host registration, JWT refresh tokens, and multi-host scheduling

Replaces the hardcoded CP_HOST_AGENT_ADDR single-agent setup with a DB-driven registration system supporting multiple host agents (BYOC).

Key changes:
- Host agents register via a one-time token and receive a 7-day JWT + 60-day refresh token; the heartbeat loop auto-refreshes on 401/403 and pauses all sandboxes if the refresh fails
- HostClientPool: lazy Connect RPC client cache keyed by host ID, replacing the single static agent client throughout the API and service layers
- RoundRobinScheduler: picks an online host for each new sandbox via ListActiveHosts; extensible for future scheduling strategies
- HostMonitor (replaces Reconciler): passive heartbeat staleness check marks hosts unreachable and sandboxes missing after 90s; active reconciliation per online host restores missing-but-alive sandboxes and stops orphans
- Graceful host delete: without ?force=true, returns 409 with the list of affected sandboxes; force-delete destroys the sandboxes, then evicts the pool client
- Snapshot delete broadcasts to all online hosts (templates have no host_id)
- sandbox.Manager.PauseAll: pauses all running VMs on CP connectivity loss
- New migration: host_refresh_tokens table with token rotation (issue-then-revoke ordering to prevent lockout on mid-rotation crash)
- New sandbox status 'missing' (reversible, unlike 'stopped') and host status 'unreachable'; both reflected in the OpenAPI spec
- Fix: refresh token auth failure now returns 401 (was 400 via generic 'invalid' substring match in serviceErrToHTTP)
2026-03-24 18:32:05 +06:00
parent f968da9768
commit 9bf67aa7f7
33 changed files with 1567 additions and 318 deletions
--- a/internal/sandbox/manager.go
+++ b/internal/sandbox/manager.go
@@ -1183,6 +1183,28 @@ func (m *Manager) Shutdown(ctx context.Context) {
 	m.loops.ReleaseAll()
 }

+// PauseAll attempts to pause each sandbox on this host agent whose status is
+// running. It is meant for the moment the host can no longer reach the control
+// plane, so that running VMs are not left unmanaged. The operation is
+// best-effort: a failed pause is logged as a warning and the loop moves on.
+func (m *Manager) PauseAll(ctx context.Context) {
+	m.mu.RLock()
+	running := make([]string, 0, len(m.boxes))
+	for sbID, box := range m.boxes {
+		if box.Status != models.StatusRunning {
+			continue
+		}
+		running = append(running, sbID)
+	}
+	m.mu.RUnlock() // IDs are snapshotted; the pauses below run without the read lock
+	slog.Info("pausing all running sandboxes due to CP connection loss", "count", len(running))
+	for i := range running {
+		if err := m.Pause(ctx, running[i]); err != nil {
+			slog.Warn("PauseAll: failed to pause sandbox", "id", running[i], "error", err)
+		}
+	}
+}
+
 // warnErr logs a warning if err is non-nil. Used for best-effort cleanup
 // in error paths where the primary error has already been captured.
 func warnErr(msg string, id string, err error) {