forked from wrenn/wrenn
Implement host registration, JWT refresh tokens, and multi-host scheduling
Replaces the hardcoded CP_HOST_AGENT_ADDR single-agent setup with a DB-driven registration system supporting multiple host agents (BYOC).

Key changes:
- Host agents register via one-time token, receive a 7-day JWT + 60-day refresh token; heartbeat loop auto-refreshes on 401/403 and pauses all sandboxes if refresh fails
- HostClientPool: lazy Connect RPC client cache keyed by host ID, replacing the single static agent client throughout the API and service layers
- RoundRobinScheduler: picks an online host for each new sandbox via ListActiveHosts; extensible for future scheduling strategies
- HostMonitor (replaces Reconciler): passive heartbeat staleness check marks hosts unreachable and sandboxes missing after 90s; active reconciliation per online host restores missing-but-alive sandboxes and stops orphans
- Graceful host delete: returns 409 with affected sandbox list without ?force=true; force-delete destroys sandboxes then evicts pool client
- Snapshot delete broadcasts to all online hosts (templates have no host_id)
- sandbox.Manager.PauseAll: pauses all running VMs on CP connectivity loss
- New migration: host_refresh_tokens table with token rotation (issue-then-revoke ordering to prevent lockout on mid-rotation crash)
- New sandbox status 'missing' (reversible, unlike 'stopped') and host status 'unreachable'; both reflected in OpenAPI spec
- Fix: refresh token auth failure now returns 401 (was 400 via generic 'invalid' substring match in serviceErrToHTTP)
This commit is contained in:
77
internal/lifecycle/hostpool.go
Normal file
77
internal/lifecycle/hostpool.go
Normal file
@ -0,0 +1,77 @@
|
||||
package lifecycle
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.omukk.dev/wrenn/sandbox/internal/db"
|
||||
"git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect"
|
||||
)
|
||||
|
||||
// HostClientPool maintains a cache of Connect RPC clients keyed by host ID.
|
||||
// Clients are created lazily on first access and evicted when a host is removed
|
||||
// or goes unreachable. The pool is safe for concurrent use.
|
||||
type HostClientPool struct {
|
||||
mu sync.RWMutex
|
||||
clients map[string]hostagentv1connect.HostAgentServiceClient
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
// NewHostClientPool creates a new pool. The underlying HTTP client uses a
|
||||
// 10-minute timeout to support long-running streaming operations.
|
||||
func NewHostClientPool() *HostClientPool {
|
||||
return &HostClientPool{
|
||||
clients: make(map[string]hostagentv1connect.HostAgentServiceClient),
|
||||
httpClient: &http.Client{Timeout: 10 * time.Minute},
|
||||
}
|
||||
}
|
||||
|
||||
// Get returns a Connect RPC client for the given host, creating one if necessary.
|
||||
// address is the host agent address (ip:port or full URL). The scheme is added if absent.
|
||||
func (p *HostClientPool) Get(hostID, address string) hostagentv1connect.HostAgentServiceClient {
|
||||
p.mu.RLock()
|
||||
c, ok := p.clients[hostID]
|
||||
p.mu.RUnlock()
|
||||
if ok {
|
||||
return c
|
||||
}
|
||||
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
// Double-check after acquiring write lock.
|
||||
if c, ok = p.clients[hostID]; ok {
|
||||
return c
|
||||
}
|
||||
c = hostagentv1connect.NewHostAgentServiceClient(p.httpClient, ensureScheme(address))
|
||||
p.clients[hostID] = c
|
||||
return c
|
||||
}
|
||||
|
||||
// GetForHost is a convenience wrapper that extracts the address from a db.Host
|
||||
// and returns an error if the host has no address recorded yet.
|
||||
func (p *HostClientPool) GetForHost(h db.Host) (hostagentv1connect.HostAgentServiceClient, error) {
|
||||
if !h.Address.Valid || h.Address.String == "" {
|
||||
return nil, fmt.Errorf("host %s has no address", h.ID)
|
||||
}
|
||||
return p.Get(h.ID, h.Address.String), nil
|
||||
}
|
||||
|
||||
// Evict removes the cached client for the given host, forcing a new client to be
|
||||
// created on the next call to Get. Call this when a host's address changes or when
|
||||
// a host is deleted.
|
||||
func (p *HostClientPool) Evict(hostID string) {
|
||||
p.mu.Lock()
|
||||
delete(p.clients, hostID)
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
// ensureScheme returns addr unchanged when it already begins with an http or
// https scheme, and otherwise prepends "http://".
//
// The scheme comparison is case-insensitive because URL schemes are
// case-insensitive; without this, an address such as "HTTPS://host:443" would
// be turned into the invalid "http://HTTPS://host:443".
func ensureScheme(addr string) string {
	for _, scheme := range [...]string{"http://", "https://"} {
		// Length guard keeps the prefix slice in bounds for short addresses.
		if len(addr) >= len(scheme) && strings.EqualFold(addr[:len(scheme)], scheme) {
			return addr
		}
	}
	return "http://" + addr
}
|
||||
Reference in New Issue
Block a user