From e069b3e67959f2cf30f6cef5b9a0928068107662 Mon Sep 17 00:00:00 2001 From: pptx704 Date: Wed, 25 Mar 2026 03:10:41 +0600 Subject: [PATCH] Add BYOC page, admin section, and is_byoc team visibility gating - Frontend: BYOC hosts page (/dashboard/byoc) with register/delete flows, shimmer loading, pulsing online status, animated token reveal checkmark - Frontend: Admin section (/admin/hosts) with platform + BYOC tabs, stat pills, skeleton loading, slide-in animations for new rows - Frontend: AdminSidebar component with accent top bar and admin pill badge - Frontend: BYOC nav item shown only when team.is_byoc is true (derived from teams store, not JWT); disabled for members - Frontend: Admin shield button in Sidebar, visible only to platform admins - Backend: is_admin in JWT claims + requireAdmin middleware (DB-validated) - Backend: is_byoc added to teamResponse so frontend derives visibility from fresh team data rather than stale JWT fields - Backend: SetBYOC admin endpoint (PUT /v1/admin/teams/{id}/byoc) - Backend: Admin hosts list enriches BYOC entries with team_name - Host agent: load .env file via godotenv on startup --- cmd/host-agent/main.go | 70 +- db/queries/hosts.sql | 3 +- frontend/src/lib/api/hosts.ts | 84 +++ frontend/src/lib/api/team.ts | 1 + frontend/src/lib/auth.svelte.ts | 25 + .../src/lib/components/AdminSidebar.svelte | 184 +++++ frontend/src/lib/components/Sidebar.svelte | 49 +- .../src/lib/components/icons/IconGear.svelte | 19 + .../lib/components/icons/IconServer.svelte | 21 + .../lib/components/icons/IconShield.svelte | 18 + frontend/src/lib/components/icons/index.ts | 3 + frontend/src/routes/admin/+layout.svelte | 7 + frontend/src/routes/admin/+layout.ts | 9 + frontend/src/routes/admin/+page.svelte | 5 + frontend/src/routes/admin/hosts/+page.svelte | 679 ++++++++++++++++++ .../src/routes/dashboard/byoc/+page.svelte | 587 +++++++++++++++ internal/api/handlers_auth.go | 6 +- internal/api/handlers_hosts.go | 31 +- internal/api/handlers_oauth.go | 6 +- internal/api/handlers_team.go | 30 +- internal/api/host_monitor.go | 4 + internal/api/middleware_admin.go | 30 + internal/api/middleware_jwt.go | 12 +- internal/api/server.go | 7 + internal/auth/context.go | 11 +- internal/auth/jwt.go | 22 +- internal/db/hosts.sql.go | 9 +- internal/hostagent/registration.go | 143 ++-- internal/hostagent/server.go | 20 +- internal/scheduler/round_robin.go | 40 +- internal/service/host.go | 25 +- internal/service/sandbox.go | 12 +- internal/service/team.go | 24 + proto/hostagent/gen/hostagent.pb.go | 120 +++- .../hostagentv1connect/hostagent.connect.go | 35 + proto/hostagent/hostagent.proto | 12 + 36 files changed, 2200 insertions(+), 163 deletions(-) create mode 100644 frontend/src/lib/api/hosts.ts create mode 100644 frontend/src/lib/components/AdminSidebar.svelte create mode 100644 frontend/src/lib/components/icons/IconGear.svelte create mode 100644 frontend/src/lib/components/icons/IconServer.svelte create mode 100644 frontend/src/lib/components/icons/IconShield.svelte create mode 100644 frontend/src/routes/admin/+layout.svelte create mode 100644 frontend/src/routes/admin/+layout.ts create mode 100644 frontend/src/routes/admin/+page.svelte create mode 100644 frontend/src/routes/admin/hosts/+page.svelte create mode 100644 frontend/src/routes/dashboard/byoc/+page.svelte create mode 100644 internal/api/middleware_admin.go diff --git a/cmd/host-agent/main.go b/cmd/host-agent/main.go index b33f886..2d34cd1 100644 --- a/cmd/host-agent/main.go +++ b/cmd/host-agent/main.go @@ -8,9 +8,12 @@ import ( "os" "os/signal" "path/filepath" + "sync" "syscall" "time" + "github.com/joho/godotenv" + "git.omukk.dev/wrenn/sandbox/internal/devicemapper" "git.omukk.dev/wrenn/sandbox/internal/hostagent" "git.omukk.dev/wrenn/sandbox/internal/sandbox" @@ -18,6 +21,9 @@ import ( ) func main() { + // Best-effort load — missing .env file is fine. + _ = godotenv.Load() + registrationToken := flag.String("register", "", "One-time registration token from the control plane (required on first run)") advertiseAddr := flag.String("address", "", "Externally-reachable address (ip:port) for this host agent") flag.Parse() @@ -87,41 +93,57 @@ func main() { slog.Info("host registered", "host_id", hostID) - // Start heartbeat loop. On CP rejection: try JWT refresh. If that fails, - // pause all running sandboxes to ensure they're not left orphaned. - hostagent.StartHeartbeat(ctx, cpURL, tokenFile, hostID, 30*time.Second, func() { - pauseCtx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) - defer cancel() - mgr.PauseAll(pauseCtx) - }) + // httpServer is declared here so the shutdown func can reference it. + httpServer := &http.Server{Addr: listenAddr} - srv := hostagent.NewServer(mgr) + // doShutdown is the single shutdown path. sync.Once ensures mgr.Shutdown + // and httpServer.Shutdown are each called exactly once regardless of + // whether shutdown is triggered by a signal, a heartbeat 404, or the + // Terminate RPC. + var shutdownOnce sync.Once + doShutdown := func(reason string) { + shutdownOnce.Do(func() { + slog.Info("shutting down", "reason", reason) + cancel() + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer shutdownCancel() + mgr.Shutdown(shutdownCtx) + if err := httpServer.Shutdown(shutdownCtx); err != nil { + slog.Error("http server shutdown error", "error", err) + } + }) + } + + srv := hostagent.NewServer(mgr, func() { + doShutdown("Terminate RPC received") + }) path, handler := hostagentv1connect.NewHostAgentServiceHandler(srv) mux := http.NewServeMux() mux.Handle(path, handler) + httpServer.Handler = mux - httpServer := &http.Server{ - Addr: listenAddr, - Handler: mux, - } + // Start heartbeat loop. Handler must be set before this because the + // immediate beat can trigger doShutdown → httpServer.Shutdown synchronously. + hostagent.StartHeartbeat(ctx, cpURL, tokenFile, hostID, 30*time.Second, + // pauseAll: called on 3 consecutive network failures. + func() { + pauseCtx, pauseCancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer pauseCancel() + mgr.PauseAll(pauseCtx) + }, + // onDeleted: called when CP returns 404 (host was deleted). + func() { + doShutdown("host deleted from CP") + }, + ) - // Graceful shutdown on signal. + // Graceful shutdown on SIGINT/SIGTERM. sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) go func() { sig := <-sigCh - slog.Info("received signal, shutting down", "signal", sig) - cancel() - - shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) - defer shutdownCancel() - - mgr.Shutdown(shutdownCtx) - - if err := httpServer.Shutdown(shutdownCtx); err != nil { - slog.Error("http server shutdown error", "error", err) - } + doShutdown("signal: " + sig.String()) }() slog.Info("host agent starting", "addr", listenAddr, "host_id", hostID) diff --git a/db/queries/hosts.sql b/db/queries/hosts.sql index 2ffe0c1..27ece00 100644 --- a/db/queries/hosts.sql +++ b/db/queries/hosts.sql @@ -72,8 +72,9 @@ SELECT * FROM hosts WHERE id = $1 AND team_id = $2; -- Returns all hosts that have completed registration (not pending/offline). SELECT * FROM hosts WHERE status NOT IN ('pending', 'offline') ORDER BY created_at; --- name: UpdateHostHeartbeatAndStatus :exec +-- name: UpdateHostHeartbeatAndStatus :execrows -- Updates last_heartbeat_at and transitions unreachable hosts back to online. +-- Returns 0 if no host was found (deleted), which the caller treats as 404. UPDATE hosts SET last_heartbeat_at = NOW(), status = CASE WHEN status = 'unreachable' THEN 'online' ELSE status END, diff --git a/frontend/src/lib/api/hosts.ts b/frontend/src/lib/api/hosts.ts new file mode 100644 index 0000000..031b7f0 --- /dev/null +++ b/frontend/src/lib/api/hosts.ts @@ -0,0 +1,84 @@ +import { apiFetch } from './client'; + +export type Host = { + id: string; + type: 'regular' | 'byoc'; + team_id?: string; + team_name?: string; + provider?: string; + availability_zone?: string; + arch?: string; + cpu_cores?: number; + memory_mb?: number; + disk_gb?: number; + address?: string; + status: 'pending' | 'online' | 'offline' | 'unreachable' | 'draining'; + last_heartbeat_at?: string; + created_by: string; + created_at: string; + updated_at: string; +}; + +export type CreateHostParams = { + type: 'regular' | 'byoc'; + team_id?: string; + provider?: string; + availability_zone?: string; +}; + +export type CreateHostResult = { + host: Host; + registration_token: string; +}; + +export async function listHosts(): Promise<{ ok: true; data: Host[] } | { ok: false; error: string }> { + return apiFetch('GET', '/api/v1/hosts'); +} + +export async function createHost( + params: CreateHostParams +): Promise<{ ok: true; data: CreateHostResult } | { ok: false; error: string }> { + return apiFetch('POST', '/api/v1/hosts', params); +} + +export async function deleteHost( + id: string, + force = false +): Promise<{ ok: true } | { ok: false; error: string; sandbox_ids?: string[] }> { + const url = `/api/v1/hosts/${id}${force ? '?force=true' : ''}`; + const res = await apiFetch('DELETE', url); + if (!res.ok) { + return res as { ok: false; error: string }; + } + return { ok: true }; +} + +export async function getDeletePreview( + id: string +): Promise<{ ok: true; data: { host: Host; sandbox_ids: string[] } } | { ok: false; error: string }> { + return apiFetch<{ host: Host; sandbox_ids: string[] }>('GET', `/api/v1/hosts/${id}/delete-preview`); +} + +export function statusColor(status: Host['status']): string { + switch (status) { + case 'online': + return 'var(--color-accent)'; + case 'pending': + return 'var(--color-amber)'; + case 'offline': + case 'unreachable': + return 'var(--color-red)'; + case 'draining': + return 'var(--color-blue)'; + default: + return 'var(--color-text-muted)'; + } +} + +export function formatSpecs(host: Host): string { + const parts: string[] = []; + if (host.cpu_cores) parts.push(`${host.cpu_cores} vCPU`); + if (host.memory_mb) parts.push(`${Math.round(host.memory_mb / 1024)}GB RAM`); + if (host.disk_gb) parts.push(`${host.disk_gb}GB disk`); + return parts.join(' · ') || '—'; +} diff --git a/frontend/src/lib/api/team.ts b/frontend/src/lib/api/team.ts index a1ff935..0ffc4ed 100644 --- a/frontend/src/lib/api/team.ts +++ b/frontend/src/lib/api/team.ts @@ -29,6 +29,7 @@ export type TeamWithRole = { id: string; name: string; slug: string; + is_byoc: boolean; created_at: string; role: string; }; diff --git a/frontend/src/lib/auth.svelte.ts b/frontend/src/lib/auth.svelte.ts index b42cf52..d39a0f4 100644 --- a/frontend/src/lib/auth.svelte.ts +++ b/frontend/src/lib/auth.svelte.ts @@ -19,12 +19,23 @@ function isTokenExpired(token: string): boolean { } } +function decodeJWTPayload(token: string): Record { + try { + const payload = token.split('.')[1]; + return JSON.parse(atob(payload.replace(/-/g, '+').replace(/_/g, '/'))); + } catch { + return {}; + } +} + function createAuth() { let token = $state(null); let userId = $state(null); let teamId = $state(null); let email = $state(null); let name = $state(null); + let isAdmin = $state(false); + let role = $state('member'); let initialized = $state(false); // Initialize from localStorage synchronously at module load. @@ -36,6 +47,9 @@ function createAuth() { teamId = localStorage.getItem(STORAGE_KEYS.teamId); email = localStorage.getItem(STORAGE_KEYS.email); name = localStorage.getItem(STORAGE_KEYS.name); + const payload = decodeJWTPayload(stored); + isAdmin = Boolean(payload.is_admin); + role = String(payload.role || 'member'); } else if (stored) { // Expired — clean up. for (const key of Object.values(STORAGE_KEYS)) { @@ -63,6 +77,12 @@ function createAuth() { get name() { return name; }, + get isAdmin() { + return isAdmin; + }, + get role() { + return role; + }, get isAuthenticated() { return isAuthenticated; }, @@ -76,6 +96,9 @@ function createAuth() { teamId = data.team_id; email = data.email; name = data.name; + const payload = decodeJWTPayload(data.token); + isAdmin = Boolean(payload.is_admin); + role = String(payload.role || 'member'); localStorage.setItem(STORAGE_KEYS.token, data.token); localStorage.setItem(STORAGE_KEYS.userId, data.user_id); @@ -90,6 +113,8 @@ function createAuth() { teamId = null; email = null; name = null; + isAdmin = false; + role = 'member'; for (const key of Object.values(STORAGE_KEYS)) { localStorage.removeItem(key); diff --git a/frontend/src/lib/components/AdminSidebar.svelte b/frontend/src/lib/components/AdminSidebar.svelte new file mode 100644 index 0000000..4bed5cc --- /dev/null +++ b/frontend/src/lib/components/AdminSidebar.svelte @@ -0,0 +1,184 @@ + + + diff --git a/frontend/src/lib/components/Sidebar.svelte b/frontend/src/lib/components/Sidebar.svelte index 47feb54..c9a7afc 100644 --- a/frontend/src/lib/components/Sidebar.svelte +++ b/frontend/src/lib/components/Sidebar.svelte @@ -19,7 +19,9 @@ IconSidebar, IconBell, IconDocs, - IconAudit + IconAudit, + IconServer, + IconShield } from './icons'; let { collapsed = $bindable(false) }: { collapsed: boolean } = $props(); @@ -39,6 +41,8 @@ label: string; icon: typeof IconMonitor; href: string; + disabled?: boolean; + disabledHint?: string; }; const platformItems: NavItem[] = [ @@ -46,11 +50,24 @@ { label: 'Templates', icon: IconBox, href: '/dashboard/snapshots' } ]; - const managementItems: NavItem[] = [ + let currentTeamIsByoc = $derived( + teamsStore.list.find((t) => t.id === auth.teamId)?.is_byoc ?? false + ); + + let managementItems = $derived([ { label: 'Keys', icon: IconKey, href: '/dashboard/keys' }, { label: 'Team', icon: IconMembers, href: '/dashboard/team' }, - { label: 'Audit Logs', icon: IconAudit, href: '/dashboard/audit' } - ]; + { label: 'Audit Logs', icon: IconAudit, href: '/dashboard/audit' }, + ...(currentTeamIsByoc + ? [{ + label: 'BYOC', + icon: IconServer, + href: '/dashboard/byoc', + disabled: auth.role === 'member', + disabledHint: 'Available to team owners and admins only' + }] + : []) + ]); const billingItems: NavItem[] = [ { label: 'Usage', icon: IconUsage, href: '/dashboard/usage' }, @@ -232,6 +249,16 @@
+ {#if auth.isAdmin} + + + {#if !collapsed}Admin{/if} + + {/if} {/if} {#each items as item} - {#if isActive(item.href)} + {#if item.disabled} +
+ + {#if !collapsed} + {item.label} + {/if} +
+ {:else if isActive(item.href)}
+ let { size = 18, class: className = '' }: { size?: number; class?: string } = $props(); + + + diff --git a/frontend/src/lib/components/icons/IconServer.svelte b/frontend/src/lib/components/icons/IconServer.svelte new file mode 100644 index 0000000..c1ae7b9 --- /dev/null +++ b/frontend/src/lib/components/icons/IconServer.svelte @@ -0,0 +1,21 @@ + + + diff --git a/frontend/src/lib/components/icons/IconShield.svelte b/frontend/src/lib/components/icons/IconShield.svelte new file mode 100644 index 0000000..056bc16 --- /dev/null +++ b/frontend/src/lib/components/icons/IconShield.svelte @@ -0,0 +1,18 @@ + + + diff --git a/frontend/src/lib/components/icons/index.ts b/frontend/src/lib/components/icons/index.ts index 6296641..fa90069 100644 --- a/frontend/src/lib/components/icons/index.ts +++ b/frontend/src/lib/components/icons/index.ts @@ -23,3 +23,6 @@ export { default as IconBell } from './IconBell.svelte'; export { default as IconDocs } from './IconDocs.svelte'; export { default as IconAudit } from './IconAudit.svelte'; export { default as IconBox } from './IconBox.svelte'; +export { default as IconServer } from './IconServer.svelte'; +export { default as IconGear } from './IconGear.svelte'; +export { default as IconShield } from './IconShield.svelte'; diff --git a/frontend/src/routes/admin/+layout.svelte b/frontend/src/routes/admin/+layout.svelte new file mode 100644 index 0000000..599de61 --- /dev/null +++ b/frontend/src/routes/admin/+layout.svelte @@ -0,0 +1,7 @@ + + + +{@render children()} diff --git a/frontend/src/routes/admin/+layout.ts b/frontend/src/routes/admin/+layout.ts new file mode 100644 index 0000000..0f46f49 --- /dev/null +++ b/frontend/src/routes/admin/+layout.ts @@ -0,0 +1,9 @@ +import { browser } from '$app/environment'; +import { redirect } from '@sveltejs/kit'; +import { auth } from '$lib/auth.svelte'; + +export const load = () => { + if (!browser) return; + if (!auth.isAuthenticated) redirect(302, '/login'); + if (!auth.isAdmin) redirect(302, '/dashboard'); +}; diff --git a/frontend/src/routes/admin/+page.svelte b/frontend/src/routes/admin/+page.svelte new file mode 100644 index 0000000..b5a56c1 --- /dev/null +++ b/frontend/src/routes/admin/+page.svelte @@ -0,0 +1,5 @@ + diff --git a/frontend/src/routes/admin/hosts/+page.svelte b/frontend/src/routes/admin/hosts/+page.svelte new file mode 100644 index 0000000..16c7476 --- /dev/null +++ b/frontend/src/routes/admin/hosts/+page.svelte @@ -0,0 +1,679 @@ + + +
+ + +
+ +
+
+
+

+ Hosts +

+

+ Platform and BYOC compute across all teams. +

+
+ {#if activeTab === 'platform'} + + {/if} +
+ + + {#if !loading && !error} +
+
+ {totalCount} + total +
+
+ + + + + {onlineCount} + online +
+ {#if pendingCount > 0} +
+ {pendingCount} + pending +
+ {/if} +
+ {/if} +
+ + +
+ {#each [['platform', 'Platform', platformHosts.length], ['byoc', 'BYOC', byocHosts.length]] as [id, label, count] (id)} + + {/each} +
+ + +
+ {#if loading} + {@render skeletonRows()} + {:else if error} +
+ {error} +
+ {:else if activeTab === 'platform'} + {@render hostsTable(platformHosts, false)} + {:else} + + {#if byocHosts.length === 0} + {@render emptyState('byoc')} + {:else} +
+ {#each byocGroups as group (group.teamId ?? '__none__')} + {@const groupPageHosts = byocPageHosts.filter(h => h.team_id === group.teamId || (group.teamId === null && !h.team_id))} + {#if groupPageHosts.length > 0} +
+
+ + {group.teamName} + + + {group.hosts.length} + +
+ {@render hostsTable(groupPageHosts, false)} +
+ {/if} + {/each} + + + {#if byocPageCount > 1} +
+ + Page {byocPage + 1} of {byocPageCount} · {byocHosts.length} hosts + +
+ + +
+
+ {/if} +
+ {/if} + {/if} +
+
+
+ +{#snippet skeletonRows()} +
+ + + + + + + + + + + + {#each Array(5) as _, i} + + + + + + + + {/each} + +
HostStatus
+
+
+
+
+
+
+
+
+{/snippet} + +{#snippet hostsTable(hosts: Host[], _showTeam: boolean)} + {#if hosts.length === 0} + {@render emptyState('platform')} + {:else} +
+ + + + + + + + + + + + {#each hosts as host (host.id)} + + + + + + + + {/each} + +
HostStatus
+
{host.id}
+ {#if host.address} +
{host.address}
+ {/if} + {#if host.provider || host.availability_zone} +
+ {[host.provider, host.availability_zone].filter(Boolean).join(' · ')} +
+ {/if} +
+ + {#if host.status === 'online'} + + + + + {:else} + + {/if} + {host.status} + + + +
+
+ {/if} +{/snippet} + +{#snippet emptyState(type: 'platform' | 'byoc')} +
+
+ +
+

+ {type === 'platform' ? 'No platform hosts yet.' : 'No BYOC hosts across any team.'} +

+

+ {type === 'platform' + ? 'Add a host to start scheduling capsules onto your own compute.' + : 'Teams that register their own compute will appear here.'} +

+
+{/snippet} + + +{#if showCreate} +
+
{ if (!creating) showCreate = false; }} + onkeydown={(e) => { if (e.key === 'Escape' && !creating) showCreate = false; }} + >
+
+

+ Add Platform Host +

+

+ Register a new platform-managed host. You'll receive a one-time registration token. +

+ + {#if createError} +
+ {createError} +
+ {/if} + +
+
+ + +
+
+ + +
+
+ +
+ + +
+
+
+{/if} + + +{#if createdResult} +
+
+
+ +
+ + + +
+ +

+ Host registered +

+

+ Pass this token to the host agent to complete registration. It expires in + 1 hour and is single-use. +

+ +
+
+ + {createdResult.registration_token} + + +
+
+ +
+ +

+ This token will not be shown again. Store it safely before closing. +

+
+ +
+ +
+
+
+{/if} + + +{#if deleteTarget} +
+
{ if (!deleting) deleteTarget = null; }} + onkeydown={(e) => { if (e.key === 'Escape' && !deleting) deleteTarget = null; }} + >
+
+

+ Delete Host +

+

+ Permanently remove {deleteTarget.id}. +

+ + {#if deletePreviewLoading} +
+ + Checking active capsules… +
+ {:else if deletePreviewSandboxes.length > 0} +
+

+ {deletePreviewSandboxes.length} active capsule{deletePreviewSandboxes.length === 1 ? '' : 's'} will be destroyed. +

+

+ All running workloads on this host will be terminated immediately. +

+
+ {/if} + + {#if deleteError} +
+ {deleteError} +
+ {/if} + +
+ + +
+
+
+{/if} + + diff --git a/frontend/src/routes/dashboard/byoc/+page.svelte b/frontend/src/routes/dashboard/byoc/+page.svelte new file mode 100644 index 0000000..acd682f --- /dev/null +++ b/frontend/src/routes/dashboard/byoc/+page.svelte @@ -0,0 +1,587 @@ + + +
+ + +
+ +
+
+
+

+ BYOC Hosts +

+

+ Your own compute, running Wrenn capsules. +

+
+ {#if canManage} + + {/if} +
+ + + {#if !loading && !error && hosts.length > 0} +
+
+ {hosts.length} + total +
+
+ + + + + {onlineCount} + online +
+
+ {/if} +
+ + +
+ {#if loading} + {@render skeletonRows()} + {:else if error} +
+ {error} +
+ {:else if hosts.length === 0} + {@render emptyState()} + {:else} +
+ + + + + + + + + {#if canManage} + + {/if} + + + + {#each hosts as host (host.id)} + + + + + + + {#if canManage} + + {/if} + + {/each} + +
HostStatus
+
{host.id}
+ {#if host.address} +
{host.address}
+ {/if} + {#if host.provider || host.availability_zone} +
+ {[host.provider, host.availability_zone].filter(Boolean).join(' · ')} +
+ {/if} +
+ + {#if host.status === 'online'} + + + + + {:else} + + {/if} + {host.status} + + + +
+
+ {/if} +
+
+
+ +{#snippet skeletonRows()} +
+ + + + + + + + + + + + {#each Array(4) as _, i} + + + + + + + + {/each} + +
HostStatus
+
+
+
+
+
+
+{/snippet} + +{#snippet emptyState()} +
+
+ +
+ {#if canManage} +

+ No hosts yet. +

+

+ Register a server and Wrenn will schedule capsules on your own infrastructure. +

+ + {:else} +

+ No hosts registered. +

+

+ Ask a team owner or admin to register a BYOC host for your team. +

+ {/if} +
+{/snippet} + + +{#if showCreate} +
+
{ if (!creating) showCreate = false; }} + onkeydown={(e) => { if (e.key === 'Escape' && !creating) showCreate = false; }} + >
+
+

+ Register Host +

+

+ Add a server to your team's BYOC pool. You'll receive a one-time registration token. +

+ + {#if createError} +
+ {createError} +
+ {/if} + +
+
+ + +
+
+ + +
+
+ +
+ + +
+
+
+{/if} + + +{#if createdResult} +
+
+
+ +
+ + + +
+ +

+ Host registered +

+

+ Pass this token to the host agent to complete registration. It expires in + 1 hour and is single-use. +

+ +
+
+ + {createdResult.registration_token} + + +
+
+ +
+ +

+ This token will not be shown again. Store it safely before closing. +

+
+ +
+ +
+
+
+{/if} + + +{#if deleteTarget} +
+
{ if (!deleting) deleteTarget = null; }} + onkeydown={(e) => { if (e.key === 'Escape' && !deleting) deleteTarget = null; }} + >
+
+

+ Delete Host +

+

+ Remove {deleteTarget.id} from your BYOC pool. +

+ + {#if deletePreviewLoading} +
+ + Checking active capsules… +
+ {:else if deletePreviewSandboxes.length > 0} +
+

+ {deletePreviewSandboxes.length} active capsule{deletePreviewSandboxes.length === 1 ? '' : 's'} will be destroyed. +

+

+ All running workloads on this host will be terminated immediately. +

+
+ {/if} + + {#if deleteError} +
+ {deleteError} +
+ {/if} + +
+ + +
+
+
+{/if} + + diff --git a/internal/api/handlers_auth.go b/internal/api/handlers_auth.go index ae63883..ba60d8e 100644 --- a/internal/api/handlers_auth.go +++ b/internal/api/handlers_auth.go @@ -168,7 +168,7 @@ func (h *authHandler) Signup(w http.ResponseWriter, r *http.Request) { return } - token, err := auth.SignJWT(h.jwtSecret, userID, teamID, req.Email, req.Name, "owner") + token, err := auth.SignJWT(h.jwtSecret, userID, teamID, req.Email, req.Name, "owner", false) if err != nil { writeError(w, http.StatusInternalServerError, "internal_error", "failed to generate token") return @@ -228,7 +228,7 @@ func (h *authHandler) Login(w http.ResponseWriter, r *http.Request) { return } - token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role) + token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role, user.IsAdmin) if err != nil { writeError(w, http.StatusInternalServerError, "internal_error", "failed to generate token") return @@ -298,7 +298,7 @@ func (h *authHandler) SwitchTeam(w http.ResponseWriter, r *http.Request) { return } - token, err := auth.SignJWT(h.jwtSecret, ac.UserID, req.TeamID, ac.Email, user.Name, membership.Role) + token, err := auth.SignJWT(h.jwtSecret, ac.UserID, req.TeamID, ac.Email, user.Name, membership.Role, user.IsAdmin) if err != nil { writeError(w, http.StatusInternalServerError, "internal_error", "failed to generate token") return diff --git a/internal/api/handlers_hosts.go b/internal/api/handlers_hosts.go index 6b87594..762fc91 100644 --- a/internal/api/handlers_hosts.go +++ b/internal/api/handlers_hosts.go @@ -77,6 +77,7 @@ type hostResponse struct { ID string `json:"id"` Type string `json:"type"` TeamID *string `json:"team_id,omitempty"` + TeamName *string `json:"team_name,omitempty"` Provider *string `json:"provider,omitempty"` AvailabilityZone *string `json:"availability_zone,omitempty"` Arch *string `json:"arch,omitempty"` @@ -174,16 +175,41 @@ func (h *hostHandler) Create(w http.ResponseWriter, r *http.Request) { // List handles GET /v1/hosts. func (h *hostHandler) List(w http.ResponseWriter, r *http.Request) { ac := auth.MustFromContext(r.Context()) + admin := h.isAdmin(r, ac.UserID) - hosts, err := h.svc.List(r.Context(), ac.TeamID, h.isAdmin(r, ac.UserID)) + hosts, err := h.svc.List(r.Context(), ac.TeamID, admin) if err != nil { writeError(w, http.StatusInternalServerError, "db_error", "failed to list hosts") return } + // Collect unique team IDs so we can fetch team names in one pass. + var teamNames map[string]string + if admin { + seen := make(map[string]struct{}) + for _, host := range hosts { + if host.TeamID.Valid { + seen[host.TeamID.String] = struct{}{} + } + } + if len(seen) > 0 { + teamNames = make(map[string]string, len(seen)) + for id := range seen { + if team, err := h.queries.GetTeam(r.Context(), id); err == nil { + teamNames[id] = team.Name + } + } + } + } + resp := make([]hostResponse, len(hosts)) for i, host := range hosts { resp[i] = hostToResponse(host) + if host.TeamID.Valid { + if name, ok := teamNames[host.TeamID.String]; ok { + resp[i].TeamName = &name + } + } } writeJSON(w, http.StatusOK, resp) @@ -322,7 +348,8 @@ func (h *hostHandler) Heartbeat(w http.ResponseWriter, r *http.Request) { } if err := h.svc.Heartbeat(r.Context(), hc.HostID); err != nil { - writeError(w, http.StatusInternalServerError, "db_error", "failed to update heartbeat") + status, code, msg := serviceErrToHTTP(err) + writeError(w, status, code, msg) return } diff --git a/internal/api/handlers_oauth.go b/internal/api/handlers_oauth.go index 1c72285..348dd85 100644 --- a/internal/api/handlers_oauth.go +++ b/internal/api/handlers_oauth.go @@ -156,7 +156,7 @@ func (h *oauthHandler) Callback(w http.ResponseWriter, r *http.Request) { redirectWithError(w, r, redirectBase, "db_error") return } - token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role) + token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role, user.IsAdmin) if err != nil { slog.Error("oauth login: failed to sign jwt", "error", err) redirectWithError(w, r, redirectBase, "internal_error") @@ -255,7 +255,7 @@ func (h *oauthHandler) Callback(w http.ResponseWriter, r *http.Request) { return } - token, err := auth.SignJWT(h.jwtSecret, userID, teamID, email, profile.Name, "owner") + token, err := auth.SignJWT(h.jwtSecret, userID, teamID, email, profile.Name, "owner", false) if err != nil { slog.Error("oauth: failed to sign jwt", "error", err) redirectWithError(w, r, redirectBase, "internal_error") @@ -290,7 +290,7 @@ func (h *oauthHandler) retryAsLogin(w http.ResponseWriter, r *http.Request, prov redirectWithError(w, r, redirectBase, "db_error") return } - token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role) + token, err := auth.SignJWT(h.jwtSecret, user.ID, team.ID, user.Email, user.Name, role, user.IsAdmin) if err != nil { slog.Error("oauth: retry login: failed to sign jwt", "error", err) redirectWithError(w, r, redirectBase, "internal_error") diff --git a/internal/api/handlers_team.go b/internal/api/handlers_team.go index e852583..fcb5564 100644 --- a/internal/api/handlers_team.go +++ b/internal/api/handlers_team.go @@ -25,6 +25,7 @@ type teamResponse struct { ID string `json:"id"` Name string `json:"name"` Slug string `json:"slug"` + IsByoc bool `json:"is_byoc"` CreatedAt string `json:"created_at"` } @@ -44,9 +45,10 @@ type memberResponse struct { func teamToResponse(t db.Team) teamResponse { resp := teamResponse{ - ID: t.ID, - Name: t.Name, - Slug: t.Slug, + ID: t.ID, + Name: t.Name, + Slug: t.Slug, + IsByoc: t.IsByoc, } if t.CreatedAt.Valid { resp.CreatedAt = t.CreatedAt.Time.Format(time.RFC3339) @@ -321,3 +323,25 @@ func (h *teamHandler) Leave(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } + +// SetBYOC handles PUT /v1/admin/teams/{id}/byoc (admin only). +// Enables or disables the BYOC feature flag for a team. +func (h *teamHandler) SetBYOC(w http.ResponseWriter, r *http.Request) { + teamID := chi.URLParam(r, "id") + + var req struct { + Enabled bool `json:"enabled"` + } + if err := decodeJSON(r, &req); err != nil { + writeError(w, http.StatusBadRequest, "invalid_request", "invalid JSON body") + return + } + + if err := h.svc.SetBYOC(r.Context(), teamID, req.Enabled); err != nil { + status, code, msg := serviceErrToHTTP(err) + writeError(w, status, code, msg) + return + } + + w.WriteHeader(http.StatusNoContent) +} diff --git a/internal/api/host_monitor.go b/internal/api/host_monitor.go index 6e64300..e2afca1 100644 --- a/internal/api/host_monitor.go +++ b/internal/api/host_monitor.go @@ -45,6 +45,10 @@ func (m *HostMonitor) Start(ctx context.Context) { ticker := time.NewTicker(m.interval) defer ticker.Stop() + // Run immediately on startup so the CP doesn't wait one full interval + // before reconciling host and sandbox state. + m.run(ctx) + for { select { case <-ctx.Done(): diff --git a/internal/api/middleware_admin.go b/internal/api/middleware_admin.go new file mode 100644 index 0000000..0685896 --- /dev/null +++ b/internal/api/middleware_admin.go @@ -0,0 +1,30 @@ +package api + +import ( + "net/http" + + "git.omukk.dev/wrenn/sandbox/internal/auth" + "git.omukk.dev/wrenn/sandbox/internal/db" +) + +// requireAdmin validates that the authenticated user is a platform admin. +// Must run after requireJWT (depends on AuthContext being present). +// Re-validates against the DB — the JWT is_admin claim is for UI only; +// the DB is the source of truth for admin access. +func requireAdmin(queries *db.Queries) func(http.Handler) http.Handler { + return func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ac, ok := auth.FromContext(r.Context()) + if !ok { + writeError(w, http.StatusUnauthorized, "unauthorized", "authentication required") + return + } + user, err := queries.GetUserByID(r.Context(), ac.UserID) + if err != nil || !user.IsAdmin { + writeError(w, http.StatusForbidden, "forbidden", "admin access required") + return + } + next.ServeHTTP(w, r) + }) + } +} diff --git a/internal/api/middleware_jwt.go b/internal/api/middleware_jwt.go index c0b17fa..96b1c68 100644 --- a/internal/api/middleware_jwt.go +++ b/internal/api/middleware_jwt.go @@ -26,12 +26,14 @@ func requireJWT(secret []byte) func(http.Handler) http.Handler { } ctx := auth.WithAuthContext(r.Context(), auth.AuthContext{ - TeamID: claims.TeamID, - UserID: claims.Subject, - Email: claims.Email, - Name: claims.Name, - Role: claims.Role, + TeamID: claims.TeamID, + UserID: claims.Subject, + Email: claims.Email, + Name: claims.Name, + Role: claims.Role, + IsAdmin: claims.IsAdmin, }) + next.ServeHTTP(w, r.WithContext(ctx)) }) } diff --git a/internal/api/server.go b/internal/api/server.go index 45148c7..302aee3 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -156,6 +156,13 @@ func New( }) }) + // Platform admin routes — require JWT + DB-validated admin status. + r.Route("/v1/admin", func(r chi.Router) { + r.Use(requireJWT(jwtSecret)) + r.Use(requireAdmin(queries)) + r.Put("/teams/{id}/byoc", teamH.SetBYOC) + }) + return &Server{router: r} } diff --git a/internal/auth/context.go b/internal/auth/context.go index 36dd06c..98db360 100644 --- a/internal/auth/context.go +++ b/internal/auth/context.go @@ -8,11 +8,12 @@ const authCtxKey contextKey = 0 // AuthContext is stamped into request context by auth middleware. type AuthContext struct { - TeamID string - UserID string // empty when authenticated via API key - Email string // empty when authenticated via API key - Name string // empty when authenticated via API key - Role string // owner, admin, or member; empty when authenticated via API key + TeamID string + UserID string // empty when authenticated via API key + Email string // empty when authenticated via API key + Name string // empty when authenticated via API key + Role string // owner, admin, or member; empty when authenticated via API key + IsAdmin bool // platform-level admin; always false when authenticated via API key } // WithAuthContext returns a new context with the given AuthContext. diff --git a/internal/auth/jwt.go b/internal/auth/jwt.go index dccd02d..a40a032 100644 --- a/internal/auth/jwt.go +++ b/internal/auth/jwt.go @@ -13,22 +13,24 @@ const HostRefreshTokenExpiry = 60 * 24 * time.Hour // 60 days; exported for serv // Claims are the JWT payload for user tokens. type Claims struct { - Type string `json:"typ,omitempty"` // empty for user tokens; used to reject host tokens - TeamID string `json:"team_id"` - Role string `json:"role"` // owner, admin, or member within TeamID - Email string `json:"email"` - Name string `json:"name"` + Type string `json:"typ,omitempty"` // empty for user tokens; used to reject host tokens + TeamID string `json:"team_id"` + Role string `json:"role"` // owner, admin, or member within TeamID + Email string `json:"email"` + Name string `json:"name"` + IsAdmin bool `json:"is_admin,omitempty"` // platform-level admin flag jwt.RegisteredClaims } // SignJWT signs a new 6-hour JWT for the given user. -func SignJWT(secret []byte, userID, teamID, email, name, role string) (string, error) { +func SignJWT(secret []byte, userID, teamID, email, name, role string, isAdmin bool) (string, error) { now := time.Now() claims := Claims{ - TeamID: teamID, - Role: role, - Email: email, - Name: name, + TeamID: teamID, + Role: role, + Email: email, + Name: name, + IsAdmin: isAdmin, RegisteredClaims: jwt.RegisteredClaims{ Subject: userID, IssuedAt: jwt.NewNumericDate(now), diff --git a/internal/db/hosts.sql.go b/internal/db/hosts.sql.go index 12524c0..90d97ca 100644 --- a/internal/db/hosts.sql.go +++ b/internal/db/hosts.sql.go @@ -574,7 +574,7 @@ func (q *Queries) UpdateHostHeartbeat(ctx context.Context, id string) error { return err } -const updateHostHeartbeatAndStatus = `-- name: UpdateHostHeartbeatAndStatus :exec +const updateHostHeartbeatAndStatus = `-- name: UpdateHostHeartbeatAndStatus :execrows UPDATE hosts SET last_heartbeat_at = NOW(), status = CASE WHEN status = 'unreachable' THEN 'online' ELSE status END, @@ -583,9 +583,10 @@ WHERE id = $1 ` // Updates last_heartbeat_at and transitions unreachable hosts back to online. -func (q *Queries) UpdateHostHeartbeatAndStatus(ctx context.Context, id string) error { - _, err := q.db.Exec(ctx, updateHostHeartbeatAndStatus, id) - return err +// Returns 0 if no host was found (deleted). +func (q *Queries) UpdateHostHeartbeatAndStatus(ctx context.Context, id string) (int64, error) { + result, err := q.db.Exec(ctx, updateHostHeartbeatAndStatus, id) + return result.RowsAffected(), err } const updateHostStatus = `-- name: UpdateHostStatus :exec diff --git a/internal/hostagent/registration.go b/internal/hostagent/registration.go index e039462..9f39c3b 100644 --- a/internal/hostagent/registration.go +++ b/internal/hostagent/registration.go @@ -96,13 +96,14 @@ func saveTokenFile(path string, tf tokenFile) error { // Register calls the control plane to register this host agent and persists // the returned JWT and refresh token to disk. Returns the host JWT token string. func Register(ctx context.Context, cfg RegistrationConfig) (string, error) { - // Check if we already have a saved token. - if tf, err := loadTokenFile(cfg.TokenFile); err == nil && tf.JWT != "" { - slog.Info("loaded existing host token", "file", cfg.TokenFile, "host_id", tf.HostID) - return tf.JWT, nil - } - + // If no explicit registration token was given, reuse the saved JWT. + // A --register flag always overrides the local file so operators can + // force re-registration without manually deleting host.jwt. if cfg.RegistrationToken == "" { + if tf, err := loadTokenFile(cfg.TokenFile); err == nil && tf.JWT != "" { + slog.Info("loaded existing host token", "file", cfg.TokenFile, "host_id", tf.HostID) + return tf.JWT, nil + } return "", fmt.Errorf("no saved host token and no registration token provided (use --register flag)") } @@ -239,7 +240,11 @@ func RefreshJWT(ctx context.Context, cpURL, tokenFilePath string) (string, error // // On repeated network failures (3 consecutive), it calls pauseAll but keeps // retrying — the connection may recover and the host should resume heartbeating. -func StartHeartbeat(ctx context.Context, cpURL, tokenFilePath, hostID string, interval time.Duration, pauseAll func()) { +// +// onDeleted is called when CP returns 404, meaning this host record was deleted. +// The token file is removed before calling onDeleted so subsequent starts prompt +// for a new registration token. +func StartHeartbeat(ctx context.Context, cpURL, tokenFilePath, hostID string, interval time.Duration, pauseAll func(), onDeleted func()) { client := &http.Client{Timeout: 10 * time.Second} go func() { @@ -255,62 +260,84 @@ func StartHeartbeat(ctx context.Context, cpURL, tokenFilePath, hostID string, in currentJWT = tf.JWT } + // beat sends one heartbeat. Returns true if the loop should stop. + beat := func() (stop bool) { + url := strings.TrimRight(cpURL, "/") + "/v1/hosts/" + hostID + "/heartbeat" + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil) + if err != nil { + slog.Warn("heartbeat: failed to create request", "error", err) + return false + } + req.Header.Set("X-Host-Token", currentJWT) + + resp, err := client.Do(req) + if err != nil { + consecutiveFailures++ + slog.Warn("heartbeat: request failed", "error", err, "consecutive_failures", consecutiveFailures) + if consecutiveFailures >= 3 && !pausedDueToFailure { + slog.Error("heartbeat: CP unreachable after 3 failures — pausing all sandboxes") + if pauseAll != nil { + pauseAll() + } + pausedDueToFailure = true + } + return false + } + resp.Body.Close() + + switch resp.StatusCode { + case http.StatusNoContent: + if consecutiveFailures > 0 || pausedDueToFailure { + slog.Info("heartbeat: CP connection restored") + } + consecutiveFailures = 0 + pausedDueToFailure = false + + case http.StatusUnauthorized, http.StatusForbidden: + slog.Warn("heartbeat: JWT rejected — attempting token refresh") + newJWT, refreshErr := RefreshJWT(ctx, cpURL, tokenFilePath) + if refreshErr != nil { + slog.Error("heartbeat: JWT refresh failed — pausing all sandboxes; manual re-registration required", + "error", refreshErr) + if pauseAll != nil && !pausedDueToFailure { + pauseAll() + pausedDueToFailure = true + } + // Stop the heartbeat loop — operator must re-register. + return true + } + currentJWT = newJWT + slog.Info("heartbeat: JWT refreshed successfully") + + case http.StatusNotFound: + slog.Error("heartbeat: host no longer exists in CP — host was deleted; removing token file and exiting") + if err := os.Remove(tokenFilePath); err != nil && !os.IsNotExist(err) { + slog.Warn("heartbeat: failed to remove token file", "error", err) + } + if onDeleted != nil { + onDeleted() + } + return true + + default: + slog.Warn("heartbeat: unexpected status", "status", resp.StatusCode) + } + return false + } + + // Send an immediate heartbeat on startup so the CP sees the host as + // online without waiting for the first ticker tick. + if beat() { + return + } + for { select { case <-ctx.Done(): return case <-ticker.C: - url := strings.TrimRight(cpURL, "/") + "/v1/hosts/" + hostID + "/heartbeat" - req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil) - if err != nil { - slog.Warn("heartbeat: failed to create request", "error", err) - continue - } - req.Header.Set("X-Host-Token", currentJWT) - - resp, err := client.Do(req) - if err != nil { - consecutiveFailures++ - slog.Warn("heartbeat: request failed", "error", err, "consecutive_failures", consecutiveFailures) - - if consecutiveFailures >= 3 && !pausedDueToFailure { - slog.Error("heartbeat: CP unreachable after 3 failures — pausing all sandboxes") - if pauseAll != nil { - pauseAll() - } - pausedDueToFailure = true - } - continue - } - resp.Body.Close() - - switch resp.StatusCode { - case http.StatusNoContent: - // Success. - if consecutiveFailures > 0 || pausedDueToFailure { - slog.Info("heartbeat: CP connection restored") - } - consecutiveFailures = 0 - pausedDueToFailure = false - - case http.StatusUnauthorized, http.StatusForbidden: - slog.Warn("heartbeat: JWT rejected — attempting token refresh") - newJWT, refreshErr := RefreshJWT(ctx, cpURL, tokenFilePath) - if refreshErr != nil { - slog.Error("heartbeat: JWT refresh failed — pausing all sandboxes; manual re-registration required", - "error", refreshErr) - if pauseAll != nil && !pausedDueToFailure { - pauseAll() - pausedDueToFailure = true - } - // Stop the heartbeat loop — operator must re-register. - return - } - currentJWT = newJWT - slog.Info("heartbeat: JWT refreshed successfully") - - default: - slog.Warn("heartbeat: unexpected status", "status", resp.StatusCode) + if beat() { + return } } } diff --git a/internal/hostagent/server.go b/internal/hostagent/server.go index d545e59..c0a4cfd 100644 --- a/internal/hostagent/server.go +++ b/internal/hostagent/server.go @@ -22,12 +22,15 @@ import ( // Server implements the HostAgentService Connect RPC handler. type Server struct { hostagentv1connect.UnimplementedHostAgentServiceHandler - mgr *sandbox.Manager + mgr *sandbox.Manager + terminate func() // called when the CP requests agent termination } // NewServer creates a new host agent RPC server. -func NewServer(mgr *sandbox.Manager) *Server { - return &Server{mgr: mgr} +// terminate is invoked (in a goroutine) when the CP calls the Terminate RPC, +// allowing main to perform a clean shutdown. +func NewServer(mgr *sandbox.Manager, terminate func()) *Server { + return &Server{mgr: mgr, terminate: terminate} } func (s *Server) CreateSandbox( @@ -412,3 +415,14 @@ func (s *Server) ListSandboxes( AutoPausedSandboxIds: s.mgr.DrainAutoPausedIDs(), }), nil } + +func (s *Server) Terminate( + _ context.Context, + _ *connect.Request[pb.TerminateRequest], +) (*connect.Response[pb.TerminateResponse], error) { + slog.Info("terminate RPC received — scheduling shutdown") + if s.terminate != nil { + go s.terminate() + } + return connect.NewResponse(&pb.TerminateResponse{}), nil +} diff --git a/internal/scheduler/round_robin.go b/internal/scheduler/round_robin.go index 52e0330..31433a0 100644 --- a/internal/scheduler/round_robin.go +++ b/internal/scheduler/round_robin.go @@ -12,11 +12,13 @@ import ( // different strategies (round-robin, least-loaded, tag-based, etc.). type HostScheduler interface { // SelectHost returns a host that can accept a new sandbox. - // Returns an error if no suitable host is available. - SelectHost(ctx context.Context) (db.Host, error) + // For BYOC teams (isByoc=true), only online BYOC hosts belonging to teamID + // are considered. For non-BYOC teams, only online regular (platform) hosts + // are considered. Returns an error if no suitable host is available. + SelectHost(ctx context.Context, teamID string, isByoc bool) (db.Host, error) } -// RoundRobinScheduler cycles through online hosts in round-robin order. +// RoundRobinScheduler cycles through eligible online hosts in round-robin order. // It re-fetches the host list on every call so that newly registered or // recovered hosts are considered immediately. type RoundRobinScheduler struct { @@ -29,23 +31,39 @@ func NewRoundRobinScheduler(queries *db.Queries) *RoundRobinScheduler { return &RoundRobinScheduler{db: queries} } -// SelectHost returns the next online host in round-robin order. -func (s *RoundRobinScheduler) SelectHost(ctx context.Context) (db.Host, error) { +// SelectHost returns the next eligible online host in round-robin order. +func (s *RoundRobinScheduler) SelectHost(ctx context.Context, teamID string, isByoc bool) (db.Host, error) { hosts, err := s.db.ListActiveHosts(ctx) if err != nil { return db.Host{}, fmt.Errorf("list hosts: %w", err) } - var online []db.Host + var eligible []db.Host for _, h := range hosts { - if h.Status == "online" && h.Address.Valid && h.Address.String != "" { - online = append(online, h) + if h.Status != "online" || !h.Address.Valid || h.Address.String == "" { + continue } + if isByoc { + // BYOC team: only use hosts belonging to this team. + if h.Type != "byoc" || !h.TeamID.Valid || h.TeamID.String != teamID { + continue + } + } else { + // Non-BYOC team: only use platform (regular) hosts. + if h.Type != "regular" { + continue + } + } + eligible = append(eligible, h) } - if len(online) == 0 { - return db.Host{}, fmt.Errorf("no online hosts available") + + if len(eligible) == 0 { + if isByoc { + return db.Host{}, fmt.Errorf("no online BYOC hosts available for team") + } + return db.Host{}, fmt.Errorf("no online platform hosts available") } idx := s.counter.Add(1) - 1 - return online[int(idx%int64(len(online)))], nil + return eligible[int(idx%int64(len(eligible)))], nil } diff --git a/internal/service/host.go b/internal/service/host.go index a9ba9ff..b3538df 100644 --- a/internal/service/host.go +++ b/internal/service/host.go @@ -123,12 +123,15 @@ func (s *HostService) Create(ctx context.Context, p HostCreateParams) (HostCreat } } - // Validate team exists and is not deleted for BYOC hosts. + // Validate team exists, is not deleted, and has BYOC enabled. if p.TeamID != "" { team, err := s.DB.GetTeam(ctx, p.TeamID) if err != nil || team.DeletedAt.Valid { return HostCreateResult{}, fmt.Errorf("invalid request: team not found") } + if !team.IsByoc { + return HostCreateResult{}, fmt.Errorf("forbidden: BYOC is not enabled for this team") + } } hostID := id.NewHostID() @@ -370,9 +373,17 @@ func hashToken(token string) string { } // Heartbeat updates the last heartbeat timestamp for a host and transitions -// any 'unreachable' host back to 'online'. +// any 'unreachable' host back to 'online'. Returns a "host not found" error +// (which becomes 404) if the host record no longer exists (e.g., was deleted). func (s *HostService) Heartbeat(ctx context.Context, hostID string) error { - return s.DB.UpdateHostHeartbeatAndStatus(ctx, hostID) + n, err := s.DB.UpdateHostHeartbeatAndStatus(ctx, hostID) + if err != nil { + return err + } + if n == 0 { + return fmt.Errorf("host not found") + } + return nil } // List returns hosts visible to the caller. @@ -447,8 +458,8 @@ func (s *HostService) Delete(ctx context.Context, hostID, userID, teamID string, return &HostHasSandboxesError{SandboxIDs: ids} } - // Gracefully destroy running sandboxes on the host agent (best-effort). - if len(sandboxes) > 0 && host.Address.Valid && host.Address.String != "" { + // Gracefully destroy running sandboxes and terminate the agent (best-effort). + if host.Address.Valid && host.Address.String != "" { agent, err := s.Pool.GetForHost(host) if err == nil { for _, sb := range sandboxes { @@ -461,6 +472,10 @@ func (s *HostService) Delete(ctx context.Context, hostID, userID, teamID string, } } } + // Tell the agent to shut itself down immediately. + if _, rpcErr := agent.Terminate(ctx, connect.NewRequest(&pb.TerminateRequest{})); rpcErr != nil { + slog.Warn("delete host: failed to send Terminate to agent", "host_id", hostID, "error", rpcErr) + } } } diff --git a/internal/service/sandbox.go b/internal/service/sandbox.go index 0c7c08f..f67eb0d 100644 --- a/internal/service/sandbox.go +++ b/internal/service/sandbox.go @@ -86,8 +86,18 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db. } } + if p.TeamID == "" { + return db.Sandbox{}, fmt.Errorf("invalid request: team_id is required") + } + + // Determine whether this team uses BYOC hosts or platform hosts. + team, err := s.DB.GetTeam(ctx, p.TeamID) + if err != nil { + return db.Sandbox{}, fmt.Errorf("team not found: %w", err) + } + // Pick a host for this sandbox. - host, err := s.Scheduler.SelectHost(ctx) + host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc) if err != nil { return db.Sandbox{}, fmt.Errorf("select host: %w", err) } diff --git a/internal/service/team.go b/internal/service/team.go index 9d2bb4c..859441e 100644 --- a/internal/service/team.go +++ b/internal/service/team.go @@ -374,3 +374,27 @@ func (s *TeamService) LeaveTeam(ctx context.Context, teamID, callerUserID string func (s *TeamService) SearchUsersByEmailPrefix(ctx context.Context, prefix string) ([]db.SearchUsersByEmailPrefixRow, error) { return s.DB.SearchUsersByEmailPrefix(ctx, pgtype.Text{String: prefix, Valid: true}) } + +// SetBYOC enables the BYOC feature flag for a team. Once enabled, BYOC cannot +// be disabled — it is a one-way transition. +// Admin-only — the caller must verify admin status before invoking this. +func (s *TeamService) SetBYOC(ctx context.Context, teamID string, enabled bool) error { + team, err := s.DB.GetTeam(ctx, teamID) + if err != nil { + return fmt.Errorf("team not found: %w", err) + } + if team.DeletedAt.Valid { + return fmt.Errorf("team not found") + } + if !enabled { + return fmt.Errorf("invalid request: BYOC cannot be disabled once enabled") + } + if team.IsByoc { + // Already enabled — idempotent, no-op. + return nil + } + if err := s.DB.SetTeamBYOC(ctx, db.SetTeamBYOCParams{ID: teamID, IsByoc: true}); err != nil { + return fmt.Errorf("set byoc: %w", err) + } + return nil +} diff --git a/proto/hostagent/gen/hostagent.pb.go b/proto/hostagent/gen/hostagent.pb.go index 447f1f7..7afd4d1 100644 --- a/proto/hostagent/gen/hostagent.pb.go +++ b/proto/hostagent/gen/hostagent.pb.go @@ -1830,6 +1830,78 @@ func (*PingSandboxResponse) Descriptor() ([]byte, []int) { return file_hostagent_proto_rawDescGZIP(), []int{32} } +type TerminateRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *TerminateRequest) Reset() { + *x = TerminateRequest{} + mi := &file_hostagent_proto_msgTypes[33] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *TerminateRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TerminateRequest) ProtoMessage() {} + +func (x *TerminateRequest) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[33] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TerminateRequest.ProtoReflect.Descriptor instead. +func (*TerminateRequest) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{33} +} + +type TerminateResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *TerminateResponse) Reset() { + *x = TerminateResponse{} + mi := &file_hostagent_proto_msgTypes[34] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *TerminateResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TerminateResponse) ProtoMessage() {} + +func (x *TerminateResponse) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[34] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TerminateResponse.ProtoReflect.Descriptor instead. +func (*TerminateResponse) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{34} +} + var File_hostagent_proto protoreflect.FileDescriptor const file_hostagent_proto_rawDesc = "" + @@ -1955,7 +2027,10 @@ const file_hostagent_proto_rawDesc = "" + "\x12PingSandboxRequest\x12\x1d\n" + "\n" + "sandbox_id\x18\x01 \x01(\tR\tsandboxId\"\x15\n" + - "\x13PingSandboxResponse2\xce\t\n" + + "\x13PingSandboxResponse\"\x12\n" + + "\x10TerminateRequest\"\x13\n" + + "\x11TerminateResponse2\x9c\n" + + "\n" + "\x10HostAgentService\x12X\n" + "\rCreateSandbox\x12\".hostagent.v1.CreateSandboxRequest\x1a#.hostagent.v1.CreateSandboxResponse\x12[\n" + "\x0eDestroySandbox\x12#.hostagent.v1.DestroySandboxRequest\x1a$.hostagent.v1.DestroySandboxResponse\x12U\n" + @@ -1971,7 +2046,8 @@ const file_hostagent_proto_rawDesc = "" + "ExecStream\x12\x1f.hostagent.v1.ExecStreamRequest\x1a .hostagent.v1.ExecStreamResponse0\x01\x12`\n" + "\x0fWriteFileStream\x12$.hostagent.v1.WriteFileStreamRequest\x1a%.hostagent.v1.WriteFileStreamResponse(\x01\x12]\n" + "\x0eReadFileStream\x12#.hostagent.v1.ReadFileStreamRequest\x1a$.hostagent.v1.ReadFileStreamResponse0\x01\x12R\n" + - "\vPingSandbox\x12 .hostagent.v1.PingSandboxRequest\x1a!.hostagent.v1.PingSandboxResponseB\xb0\x01\n" + + "\vPingSandbox\x12 .hostagent.v1.PingSandboxRequest\x1a!.hostagent.v1.PingSandboxResponse\x12L\n" + + "\tTerminate\x12\x1e.hostagent.v1.TerminateRequest\x1a\x1f.hostagent.v1.TerminateResponseB\xb0\x01\n" + "\x10com.hostagent.v1B\x0eHostagentProtoP\x01Z;git.omukk.dev/wrenn/sandbox/proto/hostagent/gen;hostagentv1\xa2\x02\x03HXX\xaa\x02\fHostagent.V1\xca\x02\fHostagent\\V1\xe2\x02\x18Hostagent\\V1\\GPBMetadata\xea\x02\rHostagent::V1b\x06proto3" var ( @@ -1986,7 +2062,7 @@ func file_hostagent_proto_rawDescGZIP() []byte { return file_hostagent_proto_rawDescData } -var file_hostagent_proto_msgTypes = make([]protoimpl.MessageInfo, 33) +var file_hostagent_proto_msgTypes = make([]protoimpl.MessageInfo, 35) var file_hostagent_proto_goTypes = []any{ (*CreateSandboxRequest)(nil), // 0: hostagent.v1.CreateSandboxRequest (*CreateSandboxResponse)(nil), // 1: hostagent.v1.CreateSandboxResponse @@ -2021,6 +2097,8 @@ var file_hostagent_proto_goTypes = []any{ (*ReadFileStreamResponse)(nil), // 30: hostagent.v1.ReadFileStreamResponse (*PingSandboxRequest)(nil), // 31: hostagent.v1.PingSandboxRequest (*PingSandboxResponse)(nil), // 32: hostagent.v1.PingSandboxResponse + (*TerminateRequest)(nil), // 33: hostagent.v1.TerminateRequest + (*TerminateResponse)(nil), // 34: hostagent.v1.TerminateResponse } var file_hostagent_proto_depIdxs = []int32{ 16, // 0: hostagent.v1.ListSandboxesResponse.sandboxes:type_name -> hostagent.v1.SandboxInfo @@ -2042,22 +2120,24 @@ var file_hostagent_proto_depIdxs = []int32{ 26, // 16: hostagent.v1.HostAgentService.WriteFileStream:input_type -> hostagent.v1.WriteFileStreamRequest 29, // 17: hostagent.v1.HostAgentService.ReadFileStream:input_type -> hostagent.v1.ReadFileStreamRequest 31, // 18: hostagent.v1.HostAgentService.PingSandbox:input_type -> hostagent.v1.PingSandboxRequest - 1, // 19: hostagent.v1.HostAgentService.CreateSandbox:output_type -> hostagent.v1.CreateSandboxResponse - 3, // 20: hostagent.v1.HostAgentService.DestroySandbox:output_type -> hostagent.v1.DestroySandboxResponse - 5, // 21: hostagent.v1.HostAgentService.PauseSandbox:output_type -> hostagent.v1.PauseSandboxResponse - 7, // 22: hostagent.v1.HostAgentService.ResumeSandbox:output_type -> hostagent.v1.ResumeSandboxResponse - 13, // 23: hostagent.v1.HostAgentService.Exec:output_type -> hostagent.v1.ExecResponse - 15, // 24: hostagent.v1.HostAgentService.ListSandboxes:output_type -> hostagent.v1.ListSandboxesResponse - 18, // 25: hostagent.v1.HostAgentService.WriteFile:output_type -> hostagent.v1.WriteFileResponse - 20, // 26: hostagent.v1.HostAgentService.ReadFile:output_type -> hostagent.v1.ReadFileResponse - 9, // 27: hostagent.v1.HostAgentService.CreateSnapshot:output_type -> hostagent.v1.CreateSnapshotResponse - 11, // 28: hostagent.v1.HostAgentService.DeleteSnapshot:output_type -> hostagent.v1.DeleteSnapshotResponse - 22, // 29: hostagent.v1.HostAgentService.ExecStream:output_type -> hostagent.v1.ExecStreamResponse - 28, // 30: hostagent.v1.HostAgentService.WriteFileStream:output_type -> hostagent.v1.WriteFileStreamResponse - 30, // 31: hostagent.v1.HostAgentService.ReadFileStream:output_type -> hostagent.v1.ReadFileStreamResponse - 32, // 32: hostagent.v1.HostAgentService.PingSandbox:output_type -> hostagent.v1.PingSandboxResponse - 19, // [19:33] is the sub-list for method output_type - 5, // [5:19] is the sub-list for method input_type + 33, // 19: hostagent.v1.HostAgentService.Terminate:input_type -> hostagent.v1.TerminateRequest + 1, // 20: hostagent.v1.HostAgentService.CreateSandbox:output_type -> hostagent.v1.CreateSandboxResponse + 3, // 21: hostagent.v1.HostAgentService.DestroySandbox:output_type -> hostagent.v1.DestroySandboxResponse + 5, // 22: hostagent.v1.HostAgentService.PauseSandbox:output_type -> hostagent.v1.PauseSandboxResponse + 7, // 23: hostagent.v1.HostAgentService.ResumeSandbox:output_type -> hostagent.v1.ResumeSandboxResponse + 13, // 24: hostagent.v1.HostAgentService.Exec:output_type -> hostagent.v1.ExecResponse + 15, // 25: hostagent.v1.HostAgentService.ListSandboxes:output_type -> hostagent.v1.ListSandboxesResponse + 18, // 26: hostagent.v1.HostAgentService.WriteFile:output_type -> hostagent.v1.WriteFileResponse + 20, // 27: hostagent.v1.HostAgentService.ReadFile:output_type -> hostagent.v1.ReadFileResponse + 9, // 28: hostagent.v1.HostAgentService.CreateSnapshot:output_type -> hostagent.v1.CreateSnapshotResponse + 11, // 29: hostagent.v1.HostAgentService.DeleteSnapshot:output_type -> hostagent.v1.DeleteSnapshotResponse + 22, // 30: hostagent.v1.HostAgentService.ExecStream:output_type -> hostagent.v1.ExecStreamResponse + 28, // 31: hostagent.v1.HostAgentService.WriteFileStream:output_type -> hostagent.v1.WriteFileStreamResponse + 30, // 32: hostagent.v1.HostAgentService.ReadFileStream:output_type -> hostagent.v1.ReadFileStreamResponse + 32, // 33: hostagent.v1.HostAgentService.PingSandbox:output_type -> hostagent.v1.PingSandboxResponse + 34, // 34: hostagent.v1.HostAgentService.Terminate:output_type -> hostagent.v1.TerminateResponse + 20, // [20:35] is the sub-list for method output_type + 5, // [5:20] is the sub-list for method input_type 5, // [5:5] is the sub-list for extension type_name 5, // [5:5] is the sub-list for extension extendee 0, // [0:5] is the sub-list for field type_name @@ -2087,7 +2167,7 @@ func file_hostagent_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_hostagent_proto_rawDesc), len(file_hostagent_proto_rawDesc)), NumEnums: 0, - NumMessages: 33, + NumMessages: 35, NumExtensions: 0, NumServices: 1, }, diff --git a/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go b/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go index 6eb5d45..d144451 100644 --- a/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go +++ b/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go @@ -74,6 +74,9 @@ const ( // HostAgentServicePingSandboxProcedure is the fully-qualified name of the HostAgentService's // PingSandbox RPC. HostAgentServicePingSandboxProcedure = "/hostagent.v1.HostAgentService/PingSandbox" + // HostAgentServiceTerminateProcedure is the fully-qualified name of the HostAgentService's + // Terminate RPC. + HostAgentServiceTerminateProcedure = "/hostagent.v1.HostAgentService/Terminate" ) // HostAgentServiceClient is a client for the hostagent.v1.HostAgentService service. @@ -108,6 +111,10 @@ type HostAgentServiceClient interface { ReadFileStream(context.Context, *connect.Request[gen.ReadFileStreamRequest]) (*connect.ServerStreamForClient[gen.ReadFileStreamResponse], error) // PingSandbox resets the inactivity timer for a running sandbox. PingSandbox(context.Context, *connect.Request[gen.PingSandboxRequest]) (*connect.Response[gen.PingSandboxResponse], error) + // Terminate instructs the host agent to destroy all sandboxes and exit. + // Called by the control plane immediately when a host is deleted so the + // agent shuts down without waiting for the next heartbeat cycle. + Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) } // NewHostAgentServiceClient constructs a client for the hostagent.v1.HostAgentService service. By @@ -205,6 +212,12 @@ func NewHostAgentServiceClient(httpClient connect.HTTPClient, baseURL string, op connect.WithSchema(hostAgentServiceMethods.ByName("PingSandbox")), connect.WithClientOptions(opts...), ), + terminate: connect.NewClient[gen.TerminateRequest, gen.TerminateResponse]( + httpClient, + baseURL+HostAgentServiceTerminateProcedure, + connect.WithSchema(hostAgentServiceMethods.ByName("Terminate")), + connect.WithClientOptions(opts...), + ), } } @@ -224,6 +237,7 @@ type hostAgentServiceClient struct { writeFileStream *connect.Client[gen.WriteFileStreamRequest, gen.WriteFileStreamResponse] readFileStream *connect.Client[gen.ReadFileStreamRequest, gen.ReadFileStreamResponse] pingSandbox *connect.Client[gen.PingSandboxRequest, gen.PingSandboxResponse] + terminate *connect.Client[gen.TerminateRequest, gen.TerminateResponse] } // CreateSandbox calls hostagent.v1.HostAgentService.CreateSandbox. @@ -296,6 +310,11 @@ func (c *hostAgentServiceClient) PingSandbox(ctx context.Context, req *connect.R return c.pingSandbox.CallUnary(ctx, req) } +// Terminate calls hostagent.v1.HostAgentService.Terminate. +func (c *hostAgentServiceClient) Terminate(ctx context.Context, req *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) { + return c.terminate.CallUnary(ctx, req) +} + // HostAgentServiceHandler is an implementation of the hostagent.v1.HostAgentService service. type HostAgentServiceHandler interface { // CreateSandbox boots a new microVM with the given configuration. @@ -328,6 +347,10 @@ type HostAgentServiceHandler interface { ReadFileStream(context.Context, *connect.Request[gen.ReadFileStreamRequest], *connect.ServerStream[gen.ReadFileStreamResponse]) error // PingSandbox resets the inactivity timer for a running sandbox. PingSandbox(context.Context, *connect.Request[gen.PingSandboxRequest]) (*connect.Response[gen.PingSandboxResponse], error) + // Terminate instructs the host agent to destroy all sandboxes and exit. + // Called by the control plane immediately when a host is deleted so the + // agent shuts down without waiting for the next heartbeat cycle. + Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) } // NewHostAgentServiceHandler builds an HTTP handler from the service implementation. It returns the @@ -421,6 +444,12 @@ func NewHostAgentServiceHandler(svc HostAgentServiceHandler, opts ...connect.Han connect.WithSchema(hostAgentServiceMethods.ByName("PingSandbox")), connect.WithHandlerOptions(opts...), ) + hostAgentServiceTerminateHandler := connect.NewUnaryHandler( + HostAgentServiceTerminateProcedure, + svc.Terminate, + connect.WithSchema(hostAgentServiceMethods.ByName("Terminate")), + connect.WithHandlerOptions(opts...), + ) return "/hostagent.v1.HostAgentService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case HostAgentServiceCreateSandboxProcedure: @@ -451,6 +480,8 @@ func NewHostAgentServiceHandler(svc HostAgentServiceHandler, opts ...connect.Han hostAgentServiceReadFileStreamHandler.ServeHTTP(w, r) case HostAgentServicePingSandboxProcedure: hostAgentServicePingSandboxHandler.ServeHTTP(w, r) + case HostAgentServiceTerminateProcedure: + hostAgentServiceTerminateHandler.ServeHTTP(w, r) default: http.NotFound(w, r) } @@ -515,3 +546,7 @@ func (UnimplementedHostAgentServiceHandler) ReadFileStream(context.Context, *con func (UnimplementedHostAgentServiceHandler) PingSandbox(context.Context, *connect.Request[gen.PingSandboxRequest]) (*connect.Response[gen.PingSandboxResponse], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hostagent.v1.HostAgentService.PingSandbox is not implemented")) } + +func (UnimplementedHostAgentServiceHandler) Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hostagent.v1.HostAgentService.Terminate is not implemented")) +} diff --git a/proto/hostagent/hostagent.proto b/proto/hostagent/hostagent.proto index b9ceccf..c9cfffa 100644 --- a/proto/hostagent/hostagent.proto +++ b/proto/hostagent/hostagent.proto @@ -49,6 +49,11 @@ service HostAgentService { // PingSandbox resets the inactivity timer for a running sandbox. rpc PingSandbox(PingSandboxRequest) returns (PingSandboxResponse); + // Terminate instructs the host agent to destroy all sandboxes and exit. + // Called by the control plane immediately when a host is deleted so the + // agent shuts down without waiting for the next heartbeat cycle. + rpc Terminate(TerminateRequest) returns (TerminateResponse); + } message CreateSandboxRequest { @@ -236,3 +241,10 @@ message PingSandboxRequest { message PingSandboxResponse {} + + +// ── Terminate ──────────────────────────────────────────────────────── + +message TerminateRequest {} + +message TerminateResponse {}