diff --git a/cmd/control-plane/main.go b/cmd/control-plane/main.go index a7d2371..9f84edc 100644 --- a/cmd/control-plane/main.go +++ b/cmd/control-plane/main.go @@ -94,6 +94,10 @@ func main() { monitor := api.NewHostMonitor(queries, hostPool, audit.New(queries), 30*time.Second) monitor.Start(ctx) + // Start metrics sampler (records per-team sandbox stats every 10s). + sampler := api.NewMetricsSampler(queries, 10*time.Second) + sampler.Start(ctx) + httpServer := &http.Server{ Addr: cfg.ListenAddr, Handler: srv.Handler(), diff --git a/db/migrations/20260325074949_metrics_snapshots.sql b/db/migrations/20260325074949_metrics_snapshots.sql new file mode 100644 index 0000000..7d373e8 --- /dev/null +++ b/db/migrations/20260325074949_metrics_snapshots.sql @@ -0,0 +1,18 @@ +-- +goose Up + +CREATE TABLE sandbox_metrics_snapshots ( + id BIGSERIAL PRIMARY KEY, + team_id TEXT NOT NULL, + sampled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + running_count INTEGER NOT NULL, + vcpus_reserved INTEGER NOT NULL, + memory_mb_reserved INTEGER NOT NULL +); + +-- All queries filter on team_id first then range-scan sampled_at. 
+CREATE INDEX idx_metrics_snapshots_team_time + ON sandbox_metrics_snapshots (team_id, sampled_at DESC); + +-- +goose Down + +DROP TABLE sandbox_metrics_snapshots; diff --git a/db/migrations/20260325135035_add_sandbox_metric_points.sql b/db/migrations/20260325135035_add_sandbox_metric_points.sql new file mode 100644 index 0000000..08e8683 --- /dev/null +++ b/db/migrations/20260325135035_add_sandbox_metric_points.sql @@ -0,0 +1,16 @@ +-- +goose Up +CREATE TABLE sandbox_metric_points ( + sandbox_id TEXT NOT NULL, + tier TEXT NOT NULL CHECK (tier IN ('10m', '2h', '24h')), + ts BIGINT NOT NULL, + cpu_pct FLOAT8 NOT NULL DEFAULT 0, + mem_bytes BIGINT NOT NULL DEFAULT 0, + disk_bytes BIGINT NOT NULL DEFAULT 0, + PRIMARY KEY (sandbox_id, tier, ts) +); + +CREATE INDEX idx_sandbox_metric_points_sandbox_tier + ON sandbox_metric_points (sandbox_id, tier); + +-- +goose Down +DROP TABLE IF EXISTS sandbox_metric_points; diff --git a/db/queries/metrics.sql b/db/queries/metrics.sql new file mode 100644 index 0000000..f58d480 --- /dev/null +++ b/db/queries/metrics.sql @@ -0,0 +1,68 @@ +-- name: InsertMetricsSnapshot :exec +INSERT INTO sandbox_metrics_snapshots (team_id, running_count, vcpus_reserved, memory_mb_reserved) +VALUES ($1, $2, $3, $4); + +-- name: GetLiveMetrics :one +-- Reads directly from sandboxes for accurate real-time current values. +-- CPU reserved = running + starting only (paused VMs release CPU). +-- RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling). 
+SELECT + (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, + (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, + (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) + + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved +FROM sandboxes +WHERE team_id = $1; + +-- name: GetPeakMetrics :one +SELECT + COALESCE(MAX(running_count), 0)::INTEGER AS peak_running_count, + COALESCE(MAX(vcpus_reserved), 0)::INTEGER AS peak_vcpus, + COALESCE(MAX(memory_mb_reserved), 0)::INTEGER AS peak_memory_mb +FROM sandbox_metrics_snapshots +WHERE team_id = $1 + AND sampled_at > NOW() - INTERVAL '30 days'; + +-- name: PruneOldMetrics :exec +DELETE FROM sandbox_metrics_snapshots +WHERE sampled_at < NOW() - INTERVAL '60 days'; + +-- name: InsertSandboxMetricPoint :exec +INSERT INTO sandbox_metric_points (sandbox_id, tier, ts, cpu_pct, mem_bytes, disk_bytes) +VALUES ($1, $2, $3, $4, $5, $6) +ON CONFLICT (sandbox_id, tier, ts) DO NOTHING; + +-- name: GetSandboxMetricPoints :many +SELECT ts, cpu_pct, mem_bytes, disk_bytes +FROM sandbox_metric_points +WHERE sandbox_id = $1 AND tier = $2 AND ts >= $3 +ORDER BY ts ASC; + +-- name: DeleteSandboxMetricPoints :exec +DELETE FROM sandbox_metric_points +WHERE sandbox_id = $1; + +-- name: DeleteSandboxMetricPointsByTier :exec +DELETE FROM sandbox_metric_points +WHERE sandbox_id = $1 AND tier = $2; + +-- name: PruneSandboxMetricPoints :exec +-- Remove metric points older than 30 days for destroyed sandboxes. +DELETE FROM sandbox_metric_points +WHERE ts < EXTRACT(EPOCH FROM NOW() - INTERVAL '30 days')::BIGINT; + +-- name: SampleSandboxMetrics :many +-- Aggregates per-team resource usage from the live sandboxes table. 
+-- Groups by all teams that have any sandbox row (including stopped) so that +-- zero-value snapshots are recorded when all capsules are stopped, keeping the +-- time-series charts continuous rather than trailing off into empty space. +-- CPU reserved = running + starting only (paused VMs release CPU). +-- RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling). +SELECT + team_id, + (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, + (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, + (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) + + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved +FROM sandboxes +GROUP BY team_id; diff --git a/frontend/package.json b/frontend/package.json index f694403..85030ec 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -26,5 +26,8 @@ "tailwindcss": "^4.2.1", "typescript": "^5.9.3", "vite": "^7.3.1" + }, + "dependencies": { + "chart.js": "^4.5.1" } } diff --git a/frontend/pnpm-lock.yaml b/frontend/pnpm-lock.yaml index 9f0353e..5b60992 100644 --- a/frontend/pnpm-lock.yaml +++ b/frontend/pnpm-lock.yaml @@ -7,6 +7,10 @@ settings: importers: .: + dependencies: + chart.js: + specifier: ^4.5.1 + version: 4.5.1 devDependencies: '@fontsource-variable/jetbrains-mono': specifier: ^5.2.8 @@ -249,6 +253,9 @@ packages: '@jridgewell/trace-mapping@0.3.31': resolution: {integrity: sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==} + '@kurkle/color@0.3.4': + resolution: {integrity: sha512-M5UknZPHRu3DEDWoipU6sE8PdkZ6Z/S+v4dD+Ke8IaNlpdSQah50lz1KtcFBa2vsdOnwbbnxJwVM4wty6udA5w==} + '@polka/url@1.0.0-next.29': resolution: {integrity: sha512-wwQAWhWSuHaag8c4q/KN/vCoeOJYshAIvMQwD4GpSb3OiZklFfvAgmj0VCBBImRpuF/aFgIRzllXlVX93Jevww==} @@ -547,6 +554,10 @@ packages: '@internationalized/date': ^3.8.1 
svelte: ^5.33.0 + chart.js@4.5.1: + resolution: {integrity: sha512-GIjfiT9dbmHRiYi6Nl2yFCq7kkwdkp1W/lp2J99rX0yo9tgJGn3lKQATztIjb5tVtevcBtIdICNWqlq5+E8/Pw==} + engines: {pnpm: '>=8'} + chokidar@4.0.3: resolution: {integrity: sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==} engines: {node: '>= 14.16.0'} @@ -980,6 +991,8 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.5 + '@kurkle/color@0.3.4': {} + '@polka/url@1.0.0-next.29': {} '@rollup/rollup-android-arm-eabi@4.59.0': @@ -1203,6 +1216,10 @@ snapshots: transitivePeerDependencies: - '@sveltejs/kit' + chart.js@4.5.1: + dependencies: + '@kurkle/color': 0.3.4 + chokidar@4.0.3: dependencies: readdirp: 4.1.2 diff --git a/frontend/src/app.css b/frontend/src/app.css index 0a6b995..9c2e326 100644 --- a/frontend/src/app.css +++ b/frontend/src/app.css @@ -69,8 +69,10 @@ --radius-avatar: 5px; --radius-logo: 6px; - /* Shadows — flat aesthetic */ - --shadow-sm: 0 0 #0000; + /* Shadows */ + --shadow-sm: 0 1px 3px rgba(0, 0, 0, 0.35), 0 1px 2px rgba(0, 0, 0, 0.2); + --shadow-card: 0 4px 12px rgba(0, 0, 0, 0.4), 0 1px 3px rgba(0, 0, 0, 0.25); + --shadow-dialog: 0 16px 48px rgba(0, 0, 0, 0.6), 0 4px 12px rgba(0, 0, 0, 0.35); } /* Base styles */ @@ -131,6 +133,24 @@ body { } } +/* Outward ring ripple — for live/running status dots; more delightful than opacity-only */ +@keyframes status-ping { + 0% { + transform: scale(1); + opacity: 0.8; + } + 80%, + 100% { + transform: scale(2.8); + opacity: 0; + } +} + +.animate-status-ping { + animation: status-ping 2s cubic-bezier(0, 0, 0.2, 1) infinite; + will-change: transform, opacity; +} + /* Fade-up entrance animation */ @keyframes fadeUp { from { diff --git a/frontend/src/lib/api/capsules.ts b/frontend/src/lib/api/capsules.ts index c51737a..cc4ad79 100644 --- a/frontend/src/lib/api/capsules.ts +++ b/frontend/src/lib/api/capsules.ts @@ -20,6 +20,10 @@ export async function listCapsules(): Promise> { return 
apiFetch('GET', '/api/v1/sandboxes'); } +export async function getCapsule(id: string): Promise> { + return apiFetch('GET', `/api/v1/sandboxes/${id}`); +} + export type CreateCapsuleParams = { template?: string; vcpus?: number; diff --git a/frontend/src/lib/api/metrics.ts b/frontend/src/lib/api/metrics.ts new file mode 100644 index 0000000..baf9f11 --- /dev/null +++ b/frontend/src/lib/api/metrics.ts @@ -0,0 +1,25 @@ +import { apiFetch, type ApiResult } from '$lib/api/client'; + +export type MetricRange = '5m' | '10m' | '1h' | '6h' | '24h'; + +export type MetricPoint = { + timestamp_unix: number; + cpu_pct: number; + mem_bytes: number; + disk_bytes: number; +}; + +export type MetricsResponse = { + sandbox_id: string; + range: MetricRange; + points: MetricPoint[]; +}; + +export async function fetchSandboxMetrics(id: string, range: MetricRange): Promise> { + return apiFetch('GET', `/api/v1/sandboxes/${id}/metrics?range=${range}`); +} + +export const METRIC_RANGES: MetricRange[] = ['5m', '10m', '1h', '6h', '24h']; + +// All ranges poll every 10 seconds. 
+export const METRIC_POLL_INTERVAL = 10_000; diff --git a/frontend/src/lib/api/stats.ts b/frontend/src/lib/api/stats.ts new file mode 100644 index 0000000..3f85483 --- /dev/null +++ b/frontend/src/lib/api/stats.ts @@ -0,0 +1,44 @@ +import { apiFetch, type ApiResult } from '$lib/api/client'; + +export type TimeRange = '5m' | '1h' | '6h' | '24h' | '30d'; + +export type StatsResponse = { + range: TimeRange; + current: { + running_count: number; + vcpus_reserved: number; + memory_mb_reserved: number; + sampled_at?: string; + }; + peaks: { + running_count: number; + vcpus: number; + memory_mb: number; + }; + series: { + labels: string[]; + running: number[]; + vcpus: number[]; + memory_mb: number[]; + }; +}; + +export async function fetchStats(range: TimeRange): Promise> { + return apiFetch('GET', `/api/v1/sandboxes/stats?range=${range}`); +} + +export const POLL_INTERVALS: Record = { + '5m': 15_000, + '1h': 30_000, + '6h': 60_000, + '24h': 120_000, + '30d': 300_000, +}; + +export const RANGE_LABELS: Record = { + '5m': '5m', + '1h': '1h', + '6h': '6h', + '24h': '24h', + '30d': '30d', +}; diff --git a/frontend/src/lib/capsule-store.svelte.ts b/frontend/src/lib/capsule-store.svelte.ts new file mode 100644 index 0000000..acc0f60 --- /dev/null +++ b/frontend/src/lib/capsule-store.svelte.ts @@ -0,0 +1,3 @@ +// Shared state written by the list page and read by the capsules layout +// for the running count badge in the header. +export const capsuleRunningCount = $state({ value: 0 }); diff --git a/frontend/src/lib/components/CreateCapsuleDialog.svelte b/frontend/src/lib/components/CreateCapsuleDialog.svelte new file mode 100644 index 0000000..b570f2b --- /dev/null +++ b/frontend/src/lib/components/CreateCapsuleDialog.svelte @@ -0,0 +1,124 @@ + + +{#if open} +
+ +
{ if (!creating) onclose(); }} + onkeydown={(e) => { if (e.key === 'Escape' && !creating) onclose(); }} + >
+ +
+

Launch Capsule

+

Configure resources and launch. The VM will be ready in under a second.

+ + {#if createError} +
+ {createError} +
+ {/if} + +
+
+ + +
+ +
+
+ + +
+
+ + +
+
+ +
+ + +
+
+ +
+ + +
+
+
+{/if} diff --git a/frontend/src/lib/components/Sidebar.svelte b/frontend/src/lib/components/Sidebar.svelte index c9a7afc..fa6ff29 100644 --- a/frontend/src/lib/components/Sidebar.svelte +++ b/frontend/src/lib/components/Sidebar.svelte @@ -21,7 +21,8 @@ IconDocs, IconAudit, IconServer, - IconShield + IconShield, + IconMetrics } from './icons'; let { collapsed = $bindable(false) }: { collapsed: boolean } = $props(); @@ -47,7 +48,8 @@ const platformItems: NavItem[] = [ { label: 'Capsules', icon: IconMonitor, href: '/dashboard/capsules' }, - { label: 'Templates', icon: IconBox, href: '/dashboard/snapshots' } + { label: 'Templates', icon: IconBox, href: '/dashboard/snapshots' }, + { label: 'Metrics', icon: IconMetrics, href: '/dashboard/metrics' } ]; let currentTeamIsByoc = $derived( @@ -342,19 +344,19 @@ {:else if isActive(item.href)} {#if !collapsed}
{/if} {#if !collapsed} - + {item.label} {/if} @@ -396,7 +398,7 @@

Create Team diff --git a/frontend/src/lib/components/StatsPanel.svelte b/frontend/src/lib/components/StatsPanel.svelte new file mode 100644 index 0000000..d7a2f12 --- /dev/null +++ b/frontend/src/lib/components/StatsPanel.svelte @@ -0,0 +1,427 @@ + + +
+ + +
+ {#if !loading} + + + Live + + {:else} +
+ {/if} +
+ +
+ {#each RANGES as r, i} + + {/each} +
+ {#if onlaunch} + + {/if} +
+
+ + +
+ + +
+
+ + Running Capsules +
+
+
+
Now
+
+ {loading ? '—' : (stats?.current.running_count ?? 0)} +
+
+
+
Peak · 30d
+
+ {loading ? '—' : (stats?.peaks.running_count ?? 0)} +
+
+
+
+ + +
+
+ + CPU · vCPUs +
+
+
+
Reserved now
+
+ {loading ? '—' : (stats?.current.vcpus_reserved ?? 0)} +
+
+
+
Peak · 30d
+
+ {loading ? '—' : (stats?.peaks.vcpus ?? 0)} +
+
+
+
+ + +
+
+ + RAM +
+
+
+
Reserved now
+
+ {loading ? '—' : fmtGB(stats?.current.memory_mb_reserved ?? 0)} +
+
+
+
Peak · 30d
+
+ {loading ? '—' : fmtGB(stats?.peaks.memory_mb ?? 0)} +
+
+
+
+ +
+ + + {#if error} +
+ + + + Failed to load stats: {error} +
+ {/if} + + +
+ + +
+
+
+ +
Running Capsules
+
+
+
+ +
+
+ + +
+ + +
+
+
+ + CPU · vCPUs +
+
+
+ +
+
+ + +
+
+
+ + RAM · GB +
+
+
+ +
+
+ +
+ +
+ +
diff --git a/frontend/src/lib/components/icons/IconMetrics.svelte b/frontend/src/lib/components/icons/IconMetrics.svelte new file mode 100644 index 0000000..3110642 --- /dev/null +++ b/frontend/src/lib/components/icons/IconMetrics.svelte @@ -0,0 +1,20 @@ + + + diff --git a/frontend/src/lib/components/icons/index.ts b/frontend/src/lib/components/icons/index.ts index fa90069..babf0a5 100644 --- a/frontend/src/lib/components/icons/index.ts +++ b/frontend/src/lib/components/icons/index.ts @@ -26,3 +26,4 @@ export { default as IconBox } from './IconBox.svelte'; export { default as IconServer } from './IconServer.svelte'; export { default as IconGear } from './IconGear.svelte'; export { default as IconShield } from './IconShield.svelte'; +export { default as IconMetrics } from './IconMetrics.svelte'; diff --git a/frontend/src/routes/dashboard/audit/+page.svelte b/frontend/src/routes/dashboard/audit/+page.svelte index 31556bb..1cd4e93 100644 --- a/frontend/src/routes/dashboard/audit/+page.svelte +++ b/frontend/src/routes/dashboard/audit/+page.svelte @@ -556,15 +556,7 @@

+ +{#if capsuleLoading} +
+
+ + + + Loading capsule... +
+
+{:else if capsuleError} +
+
+ + + + {capsuleError} +
+
+{:else if capsule} +
+ + +
+ + + +
+ + + {#if activeTab === 'metrics'} +
+ + +
+ {#if metricsAvailable && !metricsLoading} + + + Live + + {:else} +
+ {/if} + + {#if metricsAvailable} +
+ {#each METRIC_RANGES as r, i} + + {/each} +
+ {/if} +
+ + +
+
+ + +
+
Status
+ + {#if capsule.status === 'running'} + + + + + {/if} + {capsule.status} + +
+ + +
+
Template
+ {capsule.template} +
+ + +
+
CPU
+
+ {capsule.vcpus} + vCPU{capsule.vcpus !== 1 ? 's' : ''} +
+
+ + +
+
Memory
+
+ {capsule.memory_mb} + MB +
+
+ + +
+
Disk
+ +
+ + +
+
Started
+ {fmtDate(capsule.started_at)} +
+ + +
+
Idle Timeout
+ {fmtTimeout(capsule.timeout_sec)} +
+ +
+
+ + {#if metricsError} +
+ + + + Failed to load metrics: {metricsError} +
+ {/if} + + {#if metricsAvailable} + +
+ + +
+
+
+ + CPU Usage +
+ {#if latestCpu !== null} +
+ {latestCpu.toFixed(1)} + % +
+ {:else if metricsLoading} + + {/if} +
+
+ +
+
+ + +
+
+
+ + RAM Usage +
+ {#if latestRamMB !== null} +
+ {latestRamMB.toFixed(0)} + MB +
+ {:else if metricsLoading} + + {/if} +
+
+ +
+
+ +
+ {:else} + +
+ + + Live stats are only available for running or paused capsules — + current status: {capsule.status} + +
+ {/if} + +
+ {/if} +
+{/if} diff --git a/frontend/src/routes/dashboard/keys/+page.svelte b/frontend/src/routes/dashboard/keys/+page.svelte index bdb0ddb..87e9be5 100644 --- a/frontend/src/routes/dashboard/keys/+page.svelte +++ b/frontend/src/routes/dashboard/keys/+page.svelte @@ -262,7 +262,7 @@ onkeydown={(e) => { if (e.key === 'Escape' && !creating) showCreate = false; }} > -
+

New API Key

Name it after its environment or purpose — production, staging, CI. You can't rename it later.

@@ -323,7 +323,7 @@ onkeydown={(e) => { if (e.key === 'Escape') dismissReveal(); }} >
-
+
@@ -404,7 +404,7 @@ onkeydown={(e) => { if (e.key === 'Escape' && !revoking) revokeTarget = null; }} >
-
+

Revoke Key

Permanently revoke {revokeTarget.name || revokeTarget.id}. diff --git a/frontend/src/routes/dashboard/metrics/+page.svelte b/frontend/src/routes/dashboard/metrics/+page.svelte new file mode 100644 index 0000000..271c757 --- /dev/null +++ b/frontend/src/routes/dashboard/metrics/+page.svelte @@ -0,0 +1,57 @@ + + + + Wrenn — Metrics + + +

+ + +
+
+
+

+ Metrics +

+

+ Resource usage and performance across all capsules. +

+
+ + { showCreateDialog = true; }} + launchDisabled={!auth.teamId} + /> +
+ +
+
+ + + + + All systems operational +
+
+
+
+ + { showCreateDialog = false; }} +/> diff --git a/frontend/src/routes/dashboard/snapshots/+page.svelte b/frontend/src/routes/dashboard/snapshots/+page.svelte index 7c03149..e39bf3b 100644 --- a/frontend/src/routes/dashboard/snapshots/+page.svelte +++ b/frontend/src/routes/dashboard/snapshots/+page.svelte @@ -250,13 +250,24 @@
- {#each ([['all', 'All'], ['snapshot', 'Snapshots'], ['base', 'Images']] as const) as [val, label]} + {#each ([['all', 'All', ''], ['snapshot', 'Snapshots', 'var(--color-accent)'], ['base', 'Images', 'var(--color-blue)']] as const) as [val, label, color]} {/each} @@ -322,14 +333,18 @@ {#each filteredSnapshots as snapshot, i (snapshot.name)} - {@const stripeColor = snapshot.type === 'snapshot' ? 'bg-[var(--color-accent)]' : 'bg-[var(--color-blue)]'} + {@const isSnapshot = snapshot.type === 'snapshot'} + {@const typeColor = isSnapshot ? 'var(--color-accent)' : 'var(--color-blue)'}
-
+ +
+
{snapshot.name} @@ -337,8 +352,8 @@
- {#if snapshot.type === 'snapshot'} - + {#if isSnapshot} + {:else} - + Image @@ -356,7 +371,12 @@
{#if snapshot.type === 'snapshot' && snapshot.vcpus != null} - {snapshot.vcpus} + + + + + {snapshot.vcpus} + {:else} {/if} @@ -365,7 +385,12 @@
{#if snapshot.type === 'snapshot' && snapshot.memory_mb != null} - {snapshot.memory_mb} MB + + + + + {snapshot.memory_mb} MB + {:else} {/if} @@ -373,7 +398,7 @@
- {formatBytes(snapshot.size_bytes)} + {formatBytes(snapshot.size_bytes)}
@@ -694,4 +719,12 @@ .snapshot-row:hover .row-stripe { transform: scaleY(1); } + + /* Type-tinted row hover backgrounds */ + .snapshot-row.type-snapshot:hover { + background: rgba(94, 140, 88, 0.04); + } + .snapshot-row.type-image:hover { + background: rgba(90, 159, 212, 0.04); + } diff --git a/frontend/src/routes/login/+page.svelte b/frontend/src/routes/login/+page.svelte index 9685dfb..b3b76e8 100644 --- a/frontend/src/routes/login/+page.svelte +++ b/frontend/src/routes/login/+page.svelte @@ -117,10 +117,17 @@ class="relative hidden w-1/2 flex-col items-center justify-center overflow-hidden bg-[var(--color-bg-1)] lg:flex" onmousemove={handleMouseMove} > - + + + + @@ -137,13 +144,13 @@
- +

Scale Up.
Spin Out.

@@ -151,7 +158,7 @@

Isolated VMs. Milliseconds to live. diff --git a/images/wrenn-init.sh b/images/wrenn-init.sh index bec7731..32285ea 100644 --- a/images/wrenn-init.sh +++ b/images/wrenn-init.sh @@ -15,6 +15,7 @@ mount -t tmpfs tmpfs /tmp 2>/dev/null || true mount -t tmpfs tmpfs /run 2>/dev/null || true mkdir -p /sys/fs/cgroup mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null || true +echo "+cpu +memory +io" > /sys/fs/cgroup/cgroup.subtree_control 2>/dev/null || true # Set hostname hostname sandbox diff --git a/internal/api/handlers_metrics.go b/internal/api/handlers_metrics.go new file mode 100644 index 0000000..793349e --- /dev/null +++ b/internal/api/handlers_metrics.go @@ -0,0 +1,148 @@ +package api + +import ( + "context" + "net/http" + "time" + + "connectrpc.com/connect" + "github.com/go-chi/chi/v5" + + "git.omukk.dev/wrenn/sandbox/internal/auth" + "git.omukk.dev/wrenn/sandbox/internal/db" + "git.omukk.dev/wrenn/sandbox/internal/lifecycle" + pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen" +) + +type sandboxMetricsHandler struct { + db *db.Queries + pool *lifecycle.HostClientPool +} + +func newSandboxMetricsHandler(db *db.Queries, pool *lifecycle.HostClientPool) *sandboxMetricsHandler { + return &sandboxMetricsHandler{db: db, pool: pool} +} + +type metricPointResponse struct { + TimestampUnix int64 `json:"timestamp_unix"` + CPUPct float64 `json:"cpu_pct"` + MemBytes int64 `json:"mem_bytes"` + DiskBytes int64 `json:"disk_bytes"` +} + +type metricsResponse struct { + SandboxID string `json:"sandbox_id"` + Range string `json:"range"` + Points []metricPointResponse `json:"points"` +} + +// GetMetrics handles GET /v1/sandboxes/{id}/metrics?range=10m|2h|24h. 
+func (h *sandboxMetricsHandler) GetMetrics(w http.ResponseWriter, r *http.Request) { + sandboxID := chi.URLParam(r, "id") + ctx := r.Context() + ac := auth.MustFromContext(ctx) + + rangeTier := r.URL.Query().Get("range") + if rangeTier == "" { + rangeTier = "10m" + } + validRanges := map[string]bool{"5m": true, "10m": true, "1h": true, "2h": true, "6h": true, "12h": true, "24h": true} + if !validRanges[rangeTier] { + writeError(w, http.StatusBadRequest, "invalid_request", "range must be one of: 5m, 10m, 1h, 2h, 6h, 12h, 24h") + return + } + + sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID}) + if err != nil { + writeError(w, http.StatusNotFound, "not_found", "sandbox not found") + return + } + + switch sb.Status { + case "running": + h.getFromAgent(w, r, sandboxID, rangeTier, sb.HostID) + case "paused": + h.getFromDB(ctx, w, sandboxID, rangeTier) + default: + writeError(w, http.StatusNotFound, "not_found", "metrics not available for sandbox in state: "+sb.Status) + } +} + +func (h *sandboxMetricsHandler) getFromAgent(w http.ResponseWriter, r *http.Request, sandboxID, rangeTier, hostID string) { + ctx := r.Context() + + agent, err := agentForHost(ctx, h.db, h.pool, hostID) + if err != nil { + writeError(w, http.StatusServiceUnavailable, "host_unavailable", "sandbox host is not reachable") + return + } + + resp, err := agent.GetSandboxMetrics(ctx, connect.NewRequest(&pb.GetSandboxMetricsRequest{ + SandboxId: sandboxID, + Range: rangeTier, + })) + if err != nil { + status, code, msg := agentErrToHTTP(err) + writeError(w, status, code, msg) + return + } + + points := make([]metricPointResponse, len(resp.Msg.Points)) + for i, p := range resp.Msg.Points { + points[i] = metricPointResponse{ + TimestampUnix: p.TimestampUnix, + CPUPct: p.CpuPct, + MemBytes: p.MemBytes, + DiskBytes: p.DiskBytes, + } + } + + writeJSON(w, http.StatusOK, metricsResponse{ + SandboxID: sandboxID, + Range: rangeTier, + Points: points, + }) +} + +// 
rangeToDB maps a user-facing range filter to the DB tier and cutoff duration. +var rangeToDB = map[string]struct { + tier string + cutoff time.Duration +}{ + "5m": {"10m", 5 * time.Minute}, + "10m": {"10m", 10 * time.Minute}, + "1h": {"2h", 1 * time.Hour}, + "2h": {"2h", 2 * time.Hour}, + "6h": {"24h", 6 * time.Hour}, + "12h": {"24h", 12 * time.Hour}, + "24h": {"24h", 24 * time.Hour}, +} + +func (h *sandboxMetricsHandler) getFromDB(ctx context.Context, w http.ResponseWriter, sandboxID, rangeTier string) { + mapping := rangeToDB[rangeTier] + rows, err := h.db.GetSandboxMetricPoints(ctx, db.GetSandboxMetricPointsParams{ + SandboxID: sandboxID, + Tier: mapping.tier, + Ts: time.Now().Add(-mapping.cutoff).Unix(), + }) + if err != nil { + writeError(w, http.StatusInternalServerError, "internal_error", "failed to read metrics") + return + } + + points := make([]metricPointResponse, len(rows)) + for i, row := range rows { + points[i] = metricPointResponse{ + TimestampUnix: row.Ts, + CPUPct: row.CpuPct, + MemBytes: row.MemBytes, + DiskBytes: row.DiskBytes, + } + } + + writeJSON(w, http.StatusOK, metricsResponse{ + SandboxID: sandboxID, + Range: rangeTier, + Points: points, + }) +} diff --git a/internal/api/handlers_stats.go b/internal/api/handlers_stats.go new file mode 100644 index 0000000..9222ffa --- /dev/null +++ b/internal/api/handlers_stats.go @@ -0,0 +1,95 @@ +package api + +import ( + "log/slog" + "net/http" + "time" + + "git.omukk.dev/wrenn/sandbox/internal/auth" + "git.omukk.dev/wrenn/sandbox/internal/service" +) + +type statsHandler struct { + svc *service.StatsService +} + +func newStatsHandler(svc *service.StatsService) *statsHandler { + return &statsHandler{svc: svc} +} + +type statsCurrentResponse struct { + RunningCount int32 `json:"running_count"` + VCPUsReserved int32 `json:"vcpus_reserved"` + MemoryMBReserved int32 `json:"memory_mb_reserved"` +} + +type statsPeaksResponse struct { + RunningCount int32 `json:"running_count"` + VCPUs int32 `json:"vcpus"` + 
MemoryMB int32 `json:"memory_mb"` +} + +type statsSeriesResponse struct { + Labels []string `json:"labels"` + Running []int32 `json:"running"` + VCPUs []int32 `json:"vcpus"` + MemoryMB []int32 `json:"memory_mb"` +} + +type statsResponse struct { + Range string `json:"range"` + Current statsCurrentResponse `json:"current"` + Peaks statsPeaksResponse `json:"peaks"` + Series statsSeriesResponse `json:"series"` +} + +// GetStats handles GET /v1/sandboxes/stats?range=5m|1h|6h|24h|30d +func (h *statsHandler) GetStats(w http.ResponseWriter, r *http.Request) { + ac := auth.MustFromContext(r.Context()) + + rangeParam := r.URL.Query().Get("range") + if rangeParam == "" { + rangeParam = string(service.Range1h) + } + tr := service.TimeRange(rangeParam) + if !service.ValidRange(tr) { + writeError(w, http.StatusBadRequest, "invalid_request", "range must be one of: 5m, 1h, 6h, 24h, 30d") + return + } + + current, peaks, series, err := h.svc.GetStats(r.Context(), ac.TeamID, tr) + if err != nil { + slog.Error("stats handler: get stats failed", "team_id", ac.TeamID, "error", err) + writeError(w, http.StatusInternalServerError, "internal_error", "failed to retrieve stats") + return + } + + resp := statsResponse{ + Range: rangeParam, + Current: statsCurrentResponse{ + RunningCount: current.RunningCount, + VCPUsReserved: current.VCPUsReserved, + MemoryMBReserved: current.MemoryMBReserved, + }, + Peaks: statsPeaksResponse{ + RunningCount: peaks.RunningCount, + VCPUs: peaks.VCPUs, + MemoryMB: peaks.MemoryMB, + }, + Series: statsSeriesResponse{ + Labels: make([]string, len(series)), + Running: make([]int32, len(series)), + VCPUs: make([]int32, len(series)), + MemoryMB: make([]int32, len(series)), + }, + } + + for i, pt := range series { + resp.Series.Labels[i] = pt.Bucket.UTC().Format(time.RFC3339) + resp.Series.Running[i] = pt.RunningCount + resp.Series.VCPUs[i] = pt.VCPUsReserved + resp.Series.MemoryMB[i] = pt.MemoryMBReserved + } + + writeJSON(w, http.StatusOK, resp) +} diff --git 
a/internal/api/handlers_users.go b/internal/api/handlers_users.go index 549e213..8269d3c 100644 --- a/internal/api/handlers_users.go +++ b/internal/api/handlers_users.go @@ -4,34 +4,38 @@ import ( "net/http" "strings" + "github.com/jackc/pgx/v5/pgtype" + "git.omukk.dev/wrenn/sandbox/internal/auth" - "git.omukk.dev/wrenn/sandbox/internal/service" + "git.omukk.dev/wrenn/sandbox/internal/db" ) type usersHandler struct { - svc *service.TeamService + db *db.Queries } -func newUsersHandler(svc *service.TeamService) *usersHandler { - return &usersHandler{svc: svc} +func newUsersHandler(db *db.Queries) *usersHandler { + return &usersHandler{db: db} } // Search handles GET /v1/users/search?email= // Returns up to 10 users whose email starts with the given prefix. -// The prefix must be at least 3 characters long. +// The prefix must be at least 3 characters long and contain "@". func (h *usersHandler) Search(w http.ResponseWriter, r *http.Request) { auth.MustFromContext(r.Context()) // ensure authenticated prefix := strings.TrimSpace(r.URL.Query().Get("email")) - if len(prefix) < 3 { - writeError(w, http.StatusBadRequest, "invalid_request", "email prefix must be at least 3 characters") + if len(prefix) < 3 || !strings.Contains(prefix, "@") { + writeError(w, http.StatusBadRequest, "invalid_request", "email prefix must be at least 3 characters and contain '@'") return } - results, err := h.svc.SearchUsersByEmailPrefix(r.Context(), prefix) + // Escape LIKE metacharacters to prevent pattern injection. 
+ escaped := strings.NewReplacer("\\", "\\\\", "%", "\\%", "_", "\\_").Replace(prefix) + + results, err := h.db.SearchUsersByEmailPrefix(r.Context(), pgtype.Text{String: escaped, Valid: true}) if err != nil { - status, code, msg := serviceErrToHTTP(err) - writeError(w, status, code, msg) + writeError(w, http.StatusInternalServerError, "internal", "search failed") return } diff --git a/internal/api/metrics_sampler.go b/internal/api/metrics_sampler.go new file mode 100644 index 0000000..7ea3cd0 --- /dev/null +++ b/internal/api/metrics_sampler.go @@ -0,0 +1,68 @@ +package api + +import ( + "context" + "log/slog" + "time" + + "git.omukk.dev/wrenn/sandbox/internal/db" +) + +// MetricsSampler records per-team sandbox resource usage to +// sandbox_metrics_snapshots every interval. It also prunes rows older than +// 60 days on each tick to keep the table bounded. +type MetricsSampler struct { + db *db.Queries + interval time.Duration +} + +// NewMetricsSampler creates a MetricsSampler. +func NewMetricsSampler(queries *db.Queries, interval time.Duration) *MetricsSampler { + return &MetricsSampler{db: queries, interval: interval} +} + +// Start runs the sampler loop until the context is cancelled. +func (s *MetricsSampler) Start(ctx context.Context) { + go func() { + ticker := time.NewTicker(s.interval) + defer ticker.Stop() + + // Sample immediately on startup. 
+ s.run(ctx) + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + s.run(ctx) + } + } + }() +} + +func (s *MetricsSampler) run(ctx context.Context) { + s.prune(ctx) + if err := s.sample(ctx); err != nil { + slog.Warn("metrics sampler: sample failed", "error", err) + } +} + +func (s *MetricsSampler) sample(ctx context.Context) error { + rows, err := s.db.SampleSandboxMetrics(ctx) + if err != nil { + return err + } + for _, row := range rows { + if err := s.db.InsertMetricsSnapshot(ctx, db.InsertMetricsSnapshotParams(row)); err != nil { + slog.Warn("metrics sampler: insert snapshot failed", "team_id", row.TeamID, "error", err) + } + } + return nil +} + +func (s *MetricsSampler) prune(ctx context.Context) { + if err := s.db.PruneOldMetrics(ctx); err != nil { + slog.Warn("metrics sampler: prune failed", "error", err) + } +} diff --git a/internal/api/openapi.yaml b/internal/api/openapi.yaml index e46fabc..f5ae7a7 100644 --- a/internal/api/openapi.yaml +++ b/internal/api/openapi.yaml @@ -613,6 +613,32 @@ paths: items: $ref: "#/components/schemas/Sandbox" + /v1/sandboxes/stats: + get: + summary: Get sandbox usage stats for your team + operationId: getSandboxStats + tags: [sandboxes] + security: + - apiKeyAuth: [] + parameters: + - name: range + in: query + required: false + schema: + type: string + enum: [5m, 1h, 6h, 24h, 30d] + default: 1h + description: Time window for the time-series data. 
+ responses: + "200": + description: Sandbox stats for the team + content: + application/json: + schema: + $ref: "#/components/schemas/SandboxStats" + "400": + $ref: "#/components/responses/BadRequest" + /v1/sandboxes/{id}: parameters: - name: id @@ -725,6 +751,60 @@ paths: schema: $ref: "#/components/schemas/Error" + /v1/sandboxes/{id}/metrics: + parameters: + - name: id + in: path + required: true + schema: + type: string + + get: + summary: Get per-sandbox resource metrics + operationId: getSandboxMetrics + tags: [sandboxes] + security: + - apiKeyAuth: [] + - bearerAuth: [] + description: | + Returns time-series CPU, memory, and disk metrics for a sandbox. + Three tiers are available with different granularity and retention: + - `10m`: 500ms samples, last 10 minutes + - `2h`: 30-second averages, last 2 hours + - `24h`: 5-minute averages, last 24 hours + + For running sandboxes, data comes from the host agent's in-memory + ring buffer. For paused sandboxes, data is read from persisted + snapshots in the database. Stopped/destroyed sandboxes return 404. + parameters: + - name: range + in: query + required: false + schema: + type: string + enum: ["5m", "10m", "1h", "2h", "6h", "12h", "24h"] + default: "10m" + description: Time range filter to query + responses: + "200": + description: Metrics retrieved + content: + application/json: + schema: + $ref: "#/components/schemas/SandboxMetrics" + "400": + description: Invalid range parameter + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "404": + description: Sandbox not found or metrics not available + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + /v1/sandboxes/{id}/pause: parameters: - name: id @@ -1578,6 +1658,57 @@ components: after this duration of inactivity (no exec or ping). 0 means no auto-pause. 
+ SandboxStats: + type: object + properties: + range: + type: string + enum: [5m, 1h, 6h, 24h, 30d] + current: + type: object + properties: + running_count: + type: integer + vcpus_reserved: + type: integer + memory_mb_reserved: + type: integer + sampled_at: + type: string + format: date-time + nullable: true + peaks: + type: object + description: Maximum values over the last 30 days. + properties: + running_count: + type: integer + vcpus: + type: integer + memory_mb: + type: integer + series: + type: object + description: Parallel arrays for chart rendering. + properties: + labels: + type: array + items: + type: string + format: date-time + running: + type: array + items: + type: integer + vcpus: + type: array + items: + type: integer + memory_mb: + type: array + items: + type: integer + Sandbox: type: object properties: @@ -1904,6 +2035,38 @@ components: items: $ref: "#/components/schemas/TeamMember" + SandboxMetrics: + type: object + properties: + sandbox_id: + type: string + range: + type: string + enum: ["5m", "10m", "1h", "2h", "6h", "12h", "24h"] + points: + type: array + items: + $ref: "#/components/schemas/MetricPoint" + + MetricPoint: + type: object + properties: + timestamp_unix: + type: integer + format: int64 + cpu_pct: + type: number + format: double + description: "CPU utilization percentage (0-100), normalized to vCPU count" + mem_bytes: + type: integer + format: int64 + description: "Resident memory in bytes (VmRSS of Firecracker process)" + disk_bytes: + type: integer + format: int64 + description: "Allocated disk bytes for the CoW sparse file" + Error: type: object properties: diff --git a/internal/api/server.go b/internal/api/server.go index 366a122..918476b 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -46,6 +46,7 @@ func New( hostSvc := &service.HostService{DB: queries, Redis: rdb, JWT: jwtSecret, Pool: pool} teamSvc := &service.TeamService{DB: queries, Pool: pgPool, HostPool: pool} auditSvc := &service.AuditService{DB: 
queries} + statsSvc := &service.StatsService{DB: queries, Pool: pgPool} al := audit.New(queries) @@ -60,8 +61,10 @@ func New( apiKeys := newAPIKeyHandler(apiKeySvc, al) hostH := newHostHandler(hostSvc, queries, al) teamH := newTeamHandler(teamSvc, al) - usersH := newUsersHandler(teamSvc) + usersH := newUsersHandler(queries) auditH := newAuditHandler(auditSvc) + statsH := newStatsHandler(statsSvc) + metricsH := newSandboxMetricsHandler(queries, pool) // OpenAPI spec and docs. r.Get("/openapi.yaml", serveOpenAPI) @@ -109,6 +112,7 @@ func New( r.Use(requireAPIKeyOrJWT(queries, jwtSecret)) r.Post("/", sandbox.Create) r.Get("/", sandbox.List) + r.Get("/stats", statsH.GetStats) r.Route("/{id}", func(r chi.Router) { r.Get("/", sandbox.Get) @@ -122,6 +126,7 @@ func New( r.Post("/files/read", files.Download) r.Post("/files/stream/write", filesStream.StreamUpload) r.Post("/files/stream/read", filesStream.StreamDownload) + r.Get("/metrics", metricsH.GetMetrics) }) }) diff --git a/internal/db/metrics.sql.go b/internal/db/metrics.sql.go new file mode 100644 index 0000000..8050155 --- /dev/null +++ b/internal/db/metrics.sql.go @@ -0,0 +1,248 @@ +// Code generated by sqlc. DO NOT EDIT. 
+// versions: +// sqlc v1.30.0 +// source: metrics.sql + +package db + +import ( + "context" +) + +const deleteSandboxMetricPoints = `-- name: DeleteSandboxMetricPoints :exec +DELETE FROM sandbox_metric_points +WHERE sandbox_id = $1 +` + +func (q *Queries) DeleteSandboxMetricPoints(ctx context.Context, sandboxID string) error { + _, err := q.db.Exec(ctx, deleteSandboxMetricPoints, sandboxID) + return err +} + +const deleteSandboxMetricPointsByTier = `-- name: DeleteSandboxMetricPointsByTier :exec +DELETE FROM sandbox_metric_points +WHERE sandbox_id = $1 AND tier = $2 +` + +type DeleteSandboxMetricPointsByTierParams struct { + SandboxID string `json:"sandbox_id"` + Tier string `json:"tier"` +} + +func (q *Queries) DeleteSandboxMetricPointsByTier(ctx context.Context, arg DeleteSandboxMetricPointsByTierParams) error { + _, err := q.db.Exec(ctx, deleteSandboxMetricPointsByTier, arg.SandboxID, arg.Tier) + return err +} + +const getLiveMetrics = `-- name: GetLiveMetrics :one +SELECT + (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, + (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, + (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) + + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved +FROM sandboxes +WHERE team_id = $1 +` + +type GetLiveMetricsRow struct { + RunningCount int32 `json:"running_count"` + VcpusReserved int32 `json:"vcpus_reserved"` + MemoryMbReserved int32 `json:"memory_mb_reserved"` +} + +// Reads directly from sandboxes for accurate real-time current values. +// CPU reserved = running + starting only (paused VMs release CPU). +// RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling). 
+func (q *Queries) GetLiveMetrics(ctx context.Context, teamID string) (GetLiveMetricsRow, error) { + row := q.db.QueryRow(ctx, getLiveMetrics, teamID) + var i GetLiveMetricsRow + err := row.Scan(&i.RunningCount, &i.VcpusReserved, &i.MemoryMbReserved) + return i, err +} + +const getPeakMetrics = `-- name: GetPeakMetrics :one +SELECT + COALESCE(MAX(running_count), 0)::INTEGER AS peak_running_count, + COALESCE(MAX(vcpus_reserved), 0)::INTEGER AS peak_vcpus, + COALESCE(MAX(memory_mb_reserved), 0)::INTEGER AS peak_memory_mb +FROM sandbox_metrics_snapshots +WHERE team_id = $1 + AND sampled_at > NOW() - INTERVAL '30 days' +` + +type GetPeakMetricsRow struct { + PeakRunningCount int32 `json:"peak_running_count"` + PeakVcpus int32 `json:"peak_vcpus"` + PeakMemoryMb int32 `json:"peak_memory_mb"` +} + +func (q *Queries) GetPeakMetrics(ctx context.Context, teamID string) (GetPeakMetricsRow, error) { + row := q.db.QueryRow(ctx, getPeakMetrics, teamID) + var i GetPeakMetricsRow + err := row.Scan(&i.PeakRunningCount, &i.PeakVcpus, &i.PeakMemoryMb) + return i, err +} + +const getSandboxMetricPoints = `-- name: GetSandboxMetricPoints :many +SELECT ts, cpu_pct, mem_bytes, disk_bytes +FROM sandbox_metric_points +WHERE sandbox_id = $1 AND tier = $2 AND ts >= $3 +ORDER BY ts ASC +` + +type GetSandboxMetricPointsParams struct { + SandboxID string `json:"sandbox_id"` + Tier string `json:"tier"` + Ts int64 `json:"ts"` +} + +type GetSandboxMetricPointsRow struct { + Ts int64 `json:"ts"` + CpuPct float64 `json:"cpu_pct"` + MemBytes int64 `json:"mem_bytes"` + DiskBytes int64 `json:"disk_bytes"` +} + +func (q *Queries) GetSandboxMetricPoints(ctx context.Context, arg GetSandboxMetricPointsParams) ([]GetSandboxMetricPointsRow, error) { + rows, err := q.db.Query(ctx, getSandboxMetricPoints, arg.SandboxID, arg.Tier, arg.Ts) + if err != nil { + return nil, err + } + defer rows.Close() + var items []GetSandboxMetricPointsRow + for rows.Next() { + var i GetSandboxMetricPointsRow + if err := 
rows.Scan( + &i.Ts, + &i.CpuPct, + &i.MemBytes, + &i.DiskBytes, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + +const insertMetricsSnapshot = `-- name: InsertMetricsSnapshot :exec +INSERT INTO sandbox_metrics_snapshots (team_id, running_count, vcpus_reserved, memory_mb_reserved) +VALUES ($1, $2, $3, $4) +` + +type InsertMetricsSnapshotParams struct { + TeamID string `json:"team_id"` + RunningCount int32 `json:"running_count"` + VcpusReserved int32 `json:"vcpus_reserved"` + MemoryMbReserved int32 `json:"memory_mb_reserved"` +} + +func (q *Queries) InsertMetricsSnapshot(ctx context.Context, arg InsertMetricsSnapshotParams) error { + _, err := q.db.Exec(ctx, insertMetricsSnapshot, + arg.TeamID, + arg.RunningCount, + arg.VcpusReserved, + arg.MemoryMbReserved, + ) + return err +} + +const insertSandboxMetricPoint = `-- name: InsertSandboxMetricPoint :exec +INSERT INTO sandbox_metric_points (sandbox_id, tier, ts, cpu_pct, mem_bytes, disk_bytes) +VALUES ($1, $2, $3, $4, $5, $6) +ON CONFLICT (sandbox_id, tier, ts) DO NOTHING +` + +type InsertSandboxMetricPointParams struct { + SandboxID string `json:"sandbox_id"` + Tier string `json:"tier"` + Ts int64 `json:"ts"` + CpuPct float64 `json:"cpu_pct"` + MemBytes int64 `json:"mem_bytes"` + DiskBytes int64 `json:"disk_bytes"` +} + +func (q *Queries) InsertSandboxMetricPoint(ctx context.Context, arg InsertSandboxMetricPointParams) error { + _, err := q.db.Exec(ctx, insertSandboxMetricPoint, + arg.SandboxID, + arg.Tier, + arg.Ts, + arg.CpuPct, + arg.MemBytes, + arg.DiskBytes, + ) + return err +} + +const pruneOldMetrics = `-- name: PruneOldMetrics :exec +DELETE FROM sandbox_metrics_snapshots +WHERE sampled_at < NOW() - INTERVAL '60 days' +` + +func (q *Queries) PruneOldMetrics(ctx context.Context) error { + _, err := q.db.Exec(ctx, pruneOldMetrics) + return err +} + +const pruneSandboxMetricPoints = `-- name: 
PruneSandboxMetricPoints :exec +DELETE FROM sandbox_metric_points +WHERE ts < EXTRACT(EPOCH FROM NOW() - INTERVAL '30 days')::BIGINT +` + +// Remove metric points older than 30 days for destroyed sandboxes. +func (q *Queries) PruneSandboxMetricPoints(ctx context.Context) error { + _, err := q.db.Exec(ctx, pruneSandboxMetricPoints) + return err +} + +const sampleSandboxMetrics = `-- name: SampleSandboxMetrics :many +SELECT + team_id, + (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, + (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, + (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) + + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved +FROM sandboxes +GROUP BY team_id +` + +type SampleSandboxMetricsRow struct { + TeamID string `json:"team_id"` + RunningCount int32 `json:"running_count"` + VcpusReserved int32 `json:"vcpus_reserved"` + MemoryMbReserved int32 `json:"memory_mb_reserved"` +} + +// Aggregates per-team resource usage from the live sandboxes table. +// Groups by all teams that have any sandbox row (including stopped) so that +// zero-value snapshots are recorded when all capsules are stopped, keeping the +// time-series charts continuous rather than trailing off into empty space. +// CPU reserved = running + starting only (paused VMs release CPU). +// RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling). 
+func (q *Queries) SampleSandboxMetrics(ctx context.Context) ([]SampleSandboxMetricsRow, error) { + rows, err := q.db.Query(ctx, sampleSandboxMetrics) + if err != nil { + return nil, err + } + defer rows.Close() + var items []SampleSandboxMetricsRow + for rows.Next() { + var i SampleSandboxMetricsRow + if err := rows.Scan( + &i.TeamID, + &i.RunningCount, + &i.VcpusReserved, + &i.MemoryMbReserved, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} diff --git a/internal/db/models.go b/internal/db/models.go index 00cbf70..0128f4a 100644 --- a/internal/db/models.go +++ b/internal/db/models.go @@ -99,6 +99,24 @@ type Sandbox struct { TeamID string `json:"team_id"` } +type SandboxMetricPoint struct { + SandboxID string `json:"sandbox_id"` + Tier string `json:"tier"` + Ts int64 `json:"ts"` + CpuPct float64 `json:"cpu_pct"` + MemBytes int64 `json:"mem_bytes"` + DiskBytes int64 `json:"disk_bytes"` +} + +type SandboxMetricsSnapshot struct { + ID int64 `json:"id"` + TeamID string `json:"team_id"` + SampledAt pgtype.Timestamptz `json:"sampled_at"` + RunningCount int32 `json:"running_count"` + VcpusReserved int32 `json:"vcpus_reserved"` + MemoryMbReserved int32 `json:"memory_mb_reserved"` +} + type Team struct { ID string `json:"id"` Name string `json:"name"` diff --git a/internal/hostagent/server.go b/internal/hostagent/server.go index c0a4cfd..fb7fb66 100644 --- a/internal/hostagent/server.go +++ b/internal/hostagent/server.go @@ -426,3 +426,55 @@ func (s *Server) Terminate( } return connect.NewResponse(&pb.TerminateResponse{}), nil } + +func (s *Server) GetSandboxMetrics( + _ context.Context, + req *connect.Request[pb.GetSandboxMetricsRequest], +) (*connect.Response[pb.GetSandboxMetricsResponse], error) { + msg := req.Msg + + points, err := s.mgr.GetMetrics(msg.SandboxId, msg.Range) + if err != nil { + if strings.Contains(err.Error(), "not found") { + return nil, 
connect.NewError(connect.CodeNotFound, err) + } + if strings.Contains(err.Error(), "invalid range") { + return nil, connect.NewError(connect.CodeInvalidArgument, err) + } + return nil, connect.NewError(connect.CodeInternal, err) + } + + return connect.NewResponse(&pb.GetSandboxMetricsResponse{Points: metricPointsToPB(points)}), nil +} + +func (s *Server) FlushSandboxMetrics( + _ context.Context, + req *connect.Request[pb.FlushSandboxMetricsRequest], +) (*connect.Response[pb.FlushSandboxMetricsResponse], error) { + pts10m, pts2h, pts24h, err := s.mgr.FlushMetrics(req.Msg.SandboxId) + if err != nil { + if strings.Contains(err.Error(), "not found") { + return nil, connect.NewError(connect.CodeNotFound, err) + } + return nil, connect.NewError(connect.CodeInternal, err) + } + + return connect.NewResponse(&pb.FlushSandboxMetricsResponse{ + Points_10M: metricPointsToPB(pts10m), + Points_2H: metricPointsToPB(pts2h), + Points_24H: metricPointsToPB(pts24h), + }), nil +} + +func metricPointsToPB(pts []sandbox.MetricPoint) []*pb.MetricPoint { + out := make([]*pb.MetricPoint, len(pts)) + for i, p := range pts { + out[i] = &pb.MetricPoint{ + TimestampUnix: p.Timestamp.Unix(), + CpuPct: p.CPUPct, + MemBytes: p.MemBytes, + DiskBytes: p.DiskBytes, + } + } + return out +} diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index bf7d057..9a795b5 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -58,6 +58,12 @@ type sandboxState struct { // sandbox was restored. Non-nil means re-pause should use "Diff" snapshot // type instead of "Full", avoiding the UFFD fault-in storm. parent *snapshotParent + + // Metrics sampling state. 
+ fcPID int // Firecracker process PID (child of unshare wrapper) + ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics + samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine + samplerDone chan struct{} // closed when the sampling goroutine exits } // snapshotParent stores the previous generation's snapshot state so that @@ -232,6 +238,8 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, m.boxes[sandboxID] = sb m.mu.Unlock() + m.startSampler(sb) + slog.Info("sandbox created", "id", sandboxID, "template", template, @@ -265,6 +273,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error { // cleanup tears down all resources for a sandbox. func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { + m.stopSampler(sb) if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error", "id", sb.ID, "error", err) } @@ -668,6 +677,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int) m.boxes[sandboxID] = sb m.mu.Unlock() + m.startSampler(sb) + // Don't delete snapshot dir — diff files are needed for re-pause. // The CoW file was already moved out. The dir will be cleaned up // on destroy or overwritten on re-pause. @@ -987,6 +998,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam m.boxes[sandboxID] = sb m.mu.Unlock() + m.startSampler(sb) + slog.Info("sandbox created from snapshot", "id", sandboxID, "snapshot", snapshotName, @@ -1213,6 +1226,177 @@ func warnErr(msg string, id string, err error) { } } +// startSampler resolves the Firecracker PID and starts a background goroutine +// that samples CPU/mem/disk at 500ms intervals into the ring buffer. +// Must be called after the sandbox is registered in m.boxes. 
+func (m *Manager) startSampler(sb *sandboxState) { + v, ok := m.vm.Get(sb.ID) + if !ok { + slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID) + return + } + + // v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script" + // invocation. Because unshare(2) modifies the current process's namespace + // before exec-replacing itself with bash, and bash exec-replaces itself + // with ip-netns-exec, which exec-replaces itself with firecracker, the + // entire exec chain occupies the same PID. v.PID() IS the Firecracker PID. + fcPID := v.PID() + + sb.fcPID = fcPID + sb.ring = newMetricsRing() + + ctx, cancel := context.WithCancel(context.Background()) + sb.samplerCancel = cancel + sb.samplerDone = make(chan struct{}) + + // Read initial CPU counters for delta calculation. + // Passed to goroutine as local state — no shared mutation. + initialCPU, err := readCPUStat(fcPID) + if err != nil { + slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err) + } + + go m.samplerLoop(ctx, sb, fcPID, sb.VCPUs, initialCPU) +} + +// samplerLoop samples /proc metrics at 500ms intervals. +// lastCPU is goroutine-local to avoid shared-state races. 
+func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpus int, lastCPU cpuStat) { + defer close(sb.samplerDone) + + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + + clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux + lastTime := time.Now() + cpuInitialized := lastCPU != (cpuStat{}) + + for { + select { + case <-ctx.Done(): + return + case now := <-ticker.C: + elapsed := now.Sub(lastTime).Seconds() + lastTime = now + + // CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100 + var cpuPct float64 + cur, err := readCPUStat(fcPID) + if err == nil { + if cpuInitialized && elapsed > 0 && vcpus > 0 { + deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime)) + cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0 + if cpuPct > 100.0 { + cpuPct = 100.0 + } + if cpuPct < 0 { + cpuPct = 0 + } + } + lastCPU = cur + cpuInitialized = true + } + + // Memory: VmRSS of the Firecracker process. + memBytes, _ := readMemRSS(fcPID) + + // Disk: allocated bytes of the CoW sparse file. + var diskBytes int64 + if sb.dmDevice != nil { + diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath) + } + + sb.ring.Push(MetricPoint{ + Timestamp: now, + CPUPct: cpuPct, + MemBytes: memBytes, + DiskBytes: diskBytes, + }) + } + } +} + +// stopSampler stops the metrics sampling goroutine and waits for it to exit. +func (m *Manager) stopSampler(sb *sandboxState) { + if sb.samplerCancel != nil { + sb.samplerCancel() + <-sb.samplerDone + sb.samplerCancel = nil + } +} + +// GetMetrics returns the ring buffer data for the given range tier. +// Valid ranges: "10m", "2h", "24h". 
+func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) { + m.mu.RLock() + sb, ok := m.boxes[sandboxID] + m.mu.RUnlock() + if !ok { + return nil, fmt.Errorf("sandbox not found: %s", sandboxID) + } + if sb.ring == nil { + return nil, nil + } + + // Map the requested range to the appropriate ring tier and time cutoff. + var points []MetricPoint + var cutoff time.Duration + switch rangeTier { + case "5m": + points = sb.ring.Get10m() + cutoff = 5 * time.Minute + case "10m": + points = sb.ring.Get10m() + cutoff = 10 * time.Minute + case "1h": + points = sb.ring.Get2h() + cutoff = 1 * time.Hour + case "2h": + points = sb.ring.Get2h() + cutoff = 2 * time.Hour + case "6h": + points = sb.ring.Get24h() + cutoff = 6 * time.Hour + case "12h": + points = sb.ring.Get24h() + cutoff = 12 * time.Hour + case "24h": + points = sb.ring.Get24h() + cutoff = 24 * time.Hour + default: + return nil, fmt.Errorf("invalid range: %s (valid: 5m, 10m, 1h, 2h, 6h, 12h, 24h)", rangeTier) + } + + // Filter points to the requested time window. + threshold := time.Now().Add(-cutoff) + filtered := points[:0:0] + for _, p := range points { + if !p.Timestamp.Before(threshold) { + filtered = append(filtered, p) + } + } + return filtered, nil +} + +// FlushMetrics returns all three tier ring buffers, clears the ring, and +// stops the sampler goroutine. Called by the control plane before pause/destroy. +func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) { + m.mu.RLock() + sb, ok := m.boxes[sandboxID] + m.mu.RUnlock() + if !ok { + return nil, nil, nil, fmt.Errorf("sandbox not found: %s", sandboxID) + } + + m.stopSampler(sb) + if sb.ring == nil { + return nil, nil, nil, nil + } + pts10m, pts2h, pts24h = sb.ring.Flush() + return pts10m, pts2h, pts24h, nil +} + // copyFile copies a regular file from src to dst using streaming I/O. 
 func copyFile(src, dst string) error {
 	sf, err := os.Open(src)
diff --git a/internal/sandbox/metrics.go b/internal/sandbox/metrics.go
new file mode 100644
index 0000000..f266cb2
--- /dev/null
+++ b/internal/sandbox/metrics.go
@@ -0,0 +1,178 @@
+package sandbox
+
+import (
+	"sync"
+	"time"
+)
+
+// MetricPoint holds one metrics sample.
+type MetricPoint struct {
+	Timestamp time.Time // when the sample was taken
+	CPUPct    float64   // 0–100, normalized across the sandbox's vCPUs
+	MemBytes  int64     // VmRSS of the Firecracker process
+	DiskBytes int64     // allocated bytes of the CoW sparse file
+}
+
+// Ring buffer capacity constants. Tier spans assume the 500ms push cadence
+// used by samplerLoop.
+const (
+	ring10mCap = 1200 // 500ms × 1200 = 10 min
+	ring2hCap  = 240  // 30s × 240 = 2 h
+	ring24hCap = 288  // 5min × 288 = 24 h
+
+	downsample2hEvery  = 60 // 60 × 500ms = 30s
+	downsample24hEvery = 10 // 10 × 30s = 5min
+)
+
+// metricsRing holds three tiered ring buffers with automatic downsampling
+// from the finest tier into coarser tiers. All fields are guarded by mu.
+type metricsRing struct {
+	mu sync.Mutex
+
+	// 10-minute tier: 500ms samples.
+	buf10m   [ring10mCap]MetricPoint
+	idx10m   int
+	count10m int
+
+	// 2-hour tier: 30s averages.
+	buf2h   [ring2hCap]MetricPoint
+	idx2h   int
+	count2h int
+
+	// 24-hour tier: 5min averages.
+	buf24h   [ring24hCap]MetricPoint
+	idx24h   int
+	count24h int
+
+	// Accumulators for downsampling.
+	acc500ms  [downsample2hEvery]MetricPoint
+	acc500msN int
+
+	acc30s  [downsample24hEvery]MetricPoint
+	acc30sN int
+}
+
+// newMetricsRing creates an empty metrics ring buffer. The zero value is
+// usable; this constructor exists for call-site clarity.
+func newMetricsRing() *metricsRing {
+	return &metricsRing{}
+}
+
+// Push adds a 500ms sample to the finest tier and triggers downsampling
+// into coarser tiers when enough samples have accumulated.
+func (r *metricsRing) Push(p MetricPoint) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	// Write to 10m ring.
+	r.buf10m[r.idx10m] = p
+	r.idx10m = (r.idx10m + 1) % ring10mCap
+	if r.count10m < ring10mCap {
+		r.count10m++
+	}
+
+	// Accumulate for 2h downsample.
+	r.acc500ms[r.acc500msN] = p
+	r.acc500msN++
+	if r.acc500msN == downsample2hEvery {
+		avg := averagePoints(r.acc500ms[:downsample2hEvery])
+		r.push2h(avg)
+		r.acc500msN = 0
+	}
+}
+
+// push2h appends one 30s average to the 2-hour tier and feeds the 24h
+// accumulator. Caller must hold r.mu.
+func (r *metricsRing) push2h(p MetricPoint) {
+	r.buf2h[r.idx2h] = p
+	r.idx2h = (r.idx2h + 1) % ring2hCap
+	if r.count2h < ring2hCap {
+		r.count2h++
+	}
+
+	// Accumulate for 24h downsample.
+	r.acc30s[r.acc30sN] = p
+	r.acc30sN++
+	if r.acc30sN == downsample24hEvery {
+		avg := averagePoints(r.acc30s[:downsample24hEvery])
+		r.push24h(avg)
+		r.acc30sN = 0
+	}
+}
+
+// push24h appends one 5min average to the 24-hour tier. Caller must hold r.mu.
+func (r *metricsRing) push24h(p MetricPoint) {
+	r.buf24h[r.idx24h] = p
+	r.idx24h = (r.idx24h + 1) % ring24hCap
+	if r.count24h < ring24hCap {
+		r.count24h++
+	}
+}
+
+// Get10m returns the 10-minute tier points in chronological order.
+func (r *metricsRing) Get10m() []MetricPoint {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.readRing(r.buf10m[:], r.idx10m, r.count10m)
+}
+
+// Get2h returns the 2-hour tier points in chronological order.
+func (r *metricsRing) Get2h() []MetricPoint {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.readRing(r.buf2h[:], r.idx2h, r.count2h)
+}
+
+// Get24h returns the 24-hour tier points in chronological order.
+func (r *metricsRing) Get24h() []MetricPoint {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.readRing(r.buf24h[:], r.idx24h, r.count24h)
+}
+
+// Flush returns all three tiers and resets the ring buffer. Partially-filled
+// downsample accumulators are discarded (at most 30s of 2h-tier data and
+// 5min of 24h-tier data).
+func (r *metricsRing) Flush() (pts10m, pts2h, pts24h []MetricPoint) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	pts10m = r.readRing(r.buf10m[:], r.idx10m, r.count10m)
+	pts2h = r.readRing(r.buf2h[:], r.idx2h, r.count2h)
+	pts24h = r.readRing(r.buf24h[:], r.idx24h, r.count24h)
+
+	// Reset all state.
+	r.idx10m, r.count10m = 0, 0
+	r.idx2h, r.count2h = 0, 0
+	r.idx24h, r.count24h = 0, 0
+	r.acc500msN = 0
+	r.acc30sN = 0
+
+	return pts10m, pts2h, pts24h
+}
+
+// readRing extracts elements from a circular buffer in chronological order.
+// nextIdx is the slot the next write would go to; count is how many valid
+// elements the ring currently holds. Caller must hold r.mu.
+func (r *metricsRing) readRing(buf []MetricPoint, nextIdx, count int) []MetricPoint {
+	if count == 0 {
+		return nil
+	}
+	result := make([]MetricPoint, count)
+	bufLen := len(buf)
+	start := (nextIdx - count + bufLen) % bufLen
+	for i := range count {
+		result[i] = buf[(start+i)%bufLen]
+	}
+	return result
+}
+
+// averagePoints computes the average of a slice of MetricPoints.
+// The timestamp is set to the last point's timestamp. An empty slice yields
+// the zero MetricPoint instead of panicking on the pts[len(pts)-1] access.
+func averagePoints(pts []MetricPoint) MetricPoint {
+	if len(pts) == 0 {
+		return MetricPoint{}
+	}
+	n := float64(len(pts))
+	var cpu float64
+	var mem, disk int64
+	for _, p := range pts {
+		cpu += p.CPUPct
+		mem += p.MemBytes
+		disk += p.DiskBytes
+	}
+	return MetricPoint{
+		Timestamp: pts[len(pts)-1].Timestamp,
+		CPUPct:    cpu / n,
+		MemBytes:  int64(float64(mem) / n),
+		DiskBytes: int64(float64(disk) / n),
+	}
+}
diff --git a/internal/sandbox/proc.go b/internal/sandbox/proc.go
new file mode 100644
index 0000000..855d3c1
--- /dev/null
+++ b/internal/sandbox/proc.go
@@ -0,0 +1,83 @@
+package sandbox
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+// cpuStat holds raw CPU jiffies read from /proc/{pid}/stat.
+type cpuStat struct {
+	utime uint64
+	stime uint64
+}
+
+// readCPUStat reads user and system CPU jiffies from /proc/{pid}/stat.
+// Fields 14 (utime) and 15 (stime) are 1-indexed in proc(5); after stripping
+// everything through the closing paren of comm they sit at 0-based indices
+// 11 and 12 of the remaining whitespace-separated fields. (Splitting the
+// whole line on spaces is unreliable because comm may itself contain spaces.)
+func readCPUStat(pid int) (cpuStat, error) {
+	path := fmt.Sprintf("/proc/%d/stat", pid)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return cpuStat{}, fmt.Errorf("read stat: %w", err)
+	}
+
+	// /proc/{pid}/stat format: pid (comm) state fields...
+	// The comm field may contain spaces and parens, so find the last ')' first.
+	content := string(data)
+	idx := strings.LastIndex(content, ")")
+	if idx < 0 {
+		return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: no closing paren", pid)
+	}
+	// After ")" there is " state field3 field4 ... fieldN".
+	// field1 after ')' is state (index 0), utime is index 11, stime index 12.
+	// Slice from idx+1 (the separator right after ')'): strings.Fields skips
+	// leading whitespace, and idx+1 can never exceed len(content), whereas
+	// idx+2 panics when the content ends exactly at the closing paren.
+	fields := strings.Fields(content[idx+1:])
+	if len(fields) < 13 {
+		return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: too few fields (%d)", pid, len(fields))
+	}
+	utime, err := strconv.ParseUint(fields[11], 10, 64)
+	if err != nil {
+		return cpuStat{}, fmt.Errorf("parse utime: %w", err)
+	}
+	stime, err := strconv.ParseUint(fields[12], 10, 64)
+	if err != nil {
+		return cpuStat{}, fmt.Errorf("parse stime: %w", err)
+	}
+	return cpuStat{utime: utime, stime: stime}, nil
+}
+
+// readMemRSS reads VmRSS from /proc/{pid}/status and returns bytes.
+// The kernel reports VmRSS in kB; the value is converted to bytes here.
+func readMemRSS(pid int) (int64, error) {
+	path := fmt.Sprintf("/proc/%d/status", pid)
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return 0, fmt.Errorf("read status: %w", err)
+	}
+	for _, line := range strings.Split(string(data), "\n") {
+		if strings.HasPrefix(line, "VmRSS:") {
+			fields := strings.Fields(line)
+			if len(fields) < 2 {
+				return 0, fmt.Errorf("malformed VmRSS line")
+			}
+			kb, err := strconv.ParseInt(fields[1], 10, 64)
+			if err != nil {
+				return 0, fmt.Errorf("parse VmRSS: %w", err)
+			}
+			return kb * 1024, nil
+		}
+	}
+	return 0, fmt.Errorf("VmRSS not found in /proc/%d/status", pid)
+}
+
+// readDiskAllocated returns the actual allocated bytes (not apparent size)
+// of the file at path. This uses stat's block count × 512.
+func readDiskAllocated(path string) (int64, error) {
+	var stat syscall.Stat_t
+	if err := syscall.Stat(path, &stat); err != nil {
+		return 0, fmt.Errorf("stat %s: %w", path, err)
+	}
+	// st_blocks is defined by POSIX in 512-byte units regardless of the
+	// filesystem block size, so this reports true allocation for sparse files.
+	return stat.Blocks * 512, nil
+}
diff --git a/internal/service/sandbox.go b/internal/service/sandbox.go
index f67eb0d..142b9bd 100644
--- a/internal/service/sandbox.go
+++ b/internal/service/sandbox.go
@@ -58,6 +58,8 @@ type hostagentClient = interface {
 	PauseSandbox(ctx context.Context, req *connect.Request[pb.PauseSandboxRequest]) (*connect.Response[pb.PauseSandboxResponse], error)
 	ResumeSandbox(ctx context.Context, req *connect.Request[pb.ResumeSandboxRequest]) (*connect.Response[pb.ResumeSandboxResponse], error)
 	PingSandbox(ctx context.Context, req *connect.Request[pb.PingSandboxRequest]) (*connect.Response[pb.PingSandboxResponse], error)
+	GetSandboxMetrics(ctx context.Context, req *connect.Request[pb.GetSandboxMetricsRequest]) (*connect.Response[pb.GetSandboxMetricsResponse], error)
+	FlushSandboxMetrics(ctx context.Context, req *connect.Request[pb.FlushSandboxMetricsRequest]) (*connect.Response[pb.FlushSandboxMetricsResponse], error)
 }
 
 // Create creates a new sandbox: picks a host via the scheduler, inserts a pending
@@ -180,6 +182,9 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID string) (d
 		return db.Sandbox{}, err
 	}
 
+	// Flush all metrics tiers before pausing so data survives in DB.
+	// Best-effort: a flush failure does not abort the pause.
+	s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
+
 	if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
 		SandboxId: sandboxID,
 	})); err != nil {
@@ -236,7 +241,8 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID string) (
 
 // Destroy stops a sandbox and marks it as stopped.
 func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID string) error {
-	if _, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID}); err != nil {
+	// Keep the row: its Status decides which metric tiers to flush/prune below.
+	sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
+	if err != nil {
 		return fmt.Errorf("sandbox not found: %w", err)
 	}
 
@@ -245,6 +251,11 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID string)
 		return err
 	}
 
+	// If running, flush 24h tier metrics for analytics before destroying.
+	if sb.Status == "running" {
+		s.flushAndPersistMetrics(ctx, agent, sandboxID, false)
+	}
+
 	// Destroy on host agent. A not-found response is fine — sandbox is already gone.
 	if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
 		SandboxId: sandboxID,
@@ -252,6 +263,16 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID string)
 		return fmt.Errorf("agent destroy: %w", err)
 	}
 
+	// For a paused sandbox, only keep 24h tier; remove the finer-grained tiers.
+	// Errors are deliberately ignored: pruning is best-effort cleanup and must
+	// not block marking the sandbox stopped.
+	if sb.Status == "paused" {
+		_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
+			SandboxID: sandboxID, Tier: "10m",
+		})
+		_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
+			SandboxID: sandboxID, Tier: "2h",
+		})
+	}
+
 	if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
 		ID: sandboxID, Status: "stopped",
 	}); err != nil {
@@ -260,6 +281,41 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID string)
 	return nil
 }
 
+// flushAndPersistMetrics calls FlushSandboxMetrics on the agent and stores
+// the returned data to DB. If allTiers is true, all three tiers are saved;
+// otherwise only the 24h tier (for post-destroy analytics).
+func (s *SandboxService) flushAndPersistMetrics(ctx context.Context, agent hostagentClient, sandboxID string, allTiers bool) {
+	resp, err := agent.FlushSandboxMetrics(ctx, connect.NewRequest(&pb.FlushSandboxMetricsRequest{
+		SandboxId: sandboxID,
+	}))
+	if err != nil {
+		// Best-effort: losing a metrics flush must never block pause/destroy.
+		slog.Warn("flush metrics failed (best-effort)", "sandbox_id", sandboxID, "error", err)
+		return
+	}
+	msg := resp.Msg
+
+	if allTiers {
+		s.persistMetricPoints(ctx, sandboxID, "10m", msg.Points_10M)
+		s.persistMetricPoints(ctx, sandboxID, "2h", msg.Points_2H)
+	}
+	s.persistMetricPoints(ctx, sandboxID, "24h", msg.Points_24H)
+}
+
+// persistMetricPoints writes one tier's points; failures are logged and
+// skipped so a single bad row does not drop the rest.
+// NOTE(review): this issues one INSERT per point (up to ~1200 for the 10m
+// tier) — consider a batch insert/CopyFrom if flush latency matters.
+func (s *SandboxService) persistMetricPoints(ctx context.Context, sandboxID, tier string, points []*pb.MetricPoint) {
+	for _, p := range points {
+		if err := s.DB.InsertSandboxMetricPoint(ctx, db.InsertSandboxMetricPointParams{
+			SandboxID: sandboxID,
+			Tier:      tier,
+			Ts:        p.TimestampUnix,
+			CpuPct:    p.CpuPct,
+			MemBytes:  p.MemBytes,
+			DiskBytes: p.DiskBytes,
+		}); err != nil {
+			slog.Warn("persist metric point failed", "sandbox_id", sandboxID, "tier", tier, "error", err)
+		}
+	}
+}
+
 // Ping resets the inactivity timer for a running sandbox.
 func (s *SandboxService) Ping(ctx context.Context, sandboxID, teamID string) error {
 	sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
diff --git a/internal/service/stats.go b/internal/service/stats.go
new file mode 100644
index 0000000..1a075aa
--- /dev/null
+++ b/internal/service/stats.go
@@ -0,0 +1,153 @@
+package service
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/jackc/pgx/v5"
+	"github.com/jackc/pgx/v5/pgxpool"
+
+	"git.omukk.dev/wrenn/sandbox/internal/db"
+)
+
+// TimeRange identifies a chart time window.
+type TimeRange string
+
+// Supported dashboard windows.
+const (
+	Range5m  TimeRange = "5m"
+	Range1h  TimeRange = "1h"
+	Range6h  TimeRange = "6h"
+	Range24h TimeRange = "24h"
+	Range30d TimeRange = "30d"
+)
+
+// rangeConfig pairs a window with its aggregation granularity.
+type rangeConfig struct {
+	bucketSec       int    // bucket width in seconds for time-series aggregation
+	intervalLiteral string // PostgreSQL interval literal for the lookback window
+}
+
+// rangeConfigs maps each supported TimeRange to its bucket width and lookback.
+var rangeConfigs = map[TimeRange]rangeConfig{
+	Range5m:  {bucketSec: 3, intervalLiteral: "5 minutes"},
+	Range1h:  {bucketSec: 30, intervalLiteral: "1 hour"},
+	Range6h:  {bucketSec: 180, intervalLiteral: "6 hours"},
+	Range24h: {bucketSec: 720, intervalLiteral: "24 hours"},
+	Range30d: {bucketSec: 21600, intervalLiteral: "30 days"},
+}
+
+// ValidRange returns true if r is a known TimeRange value.
+func ValidRange(r TimeRange) bool {
+	_, ok := rangeConfigs[r]
+	return ok
+}
+
+// StatPoint is one bucketed data point in the time-series.
+type StatPoint struct {
+	Bucket           time.Time // start of the aggregation bucket
+	RunningCount     int32
+	VCPUsReserved    int32
+	MemoryMBReserved int32
+}
+
+// CurrentStats holds the live values for a team, read directly from sandboxes.
+type CurrentStats struct {
+	RunningCount     int32
+	VCPUsReserved    int32
+	MemoryMBReserved int32
+}
+
+// PeakStats holds the 30-day maximum values for a team.
+type PeakStats struct {
+	RunningCount int32
+	VCPUs        int32
+	MemoryMB     int32
+}
+
+// StatsService computes sandbox metrics for the dashboard.
+type StatsService struct {
+	DB   *db.Queries
+	Pool *pgxpool.Pool // raw pool for the dynamic-bucket time-series query
+}
+
+// GetStats returns current stats, 30-day peaks, and a time-series for the
+// given team and time range. If no snapshots exist yet, zeros are returned.
+func (s *StatsService) GetStats(ctx context.Context, teamID string, r TimeRange) (CurrentStats, PeakStats, []StatPoint, error) {
+	cfg, ok := rangeConfigs[r]
+	if !ok {
+		return CurrentStats{}, PeakStats{}, nil, fmt.Errorf("unknown range: %s", r)
+	}
+
+	// Current live values — read directly from sandboxes so we always reflect
+	// the true state even when no sandboxes are running.
+	cur, err := s.DB.GetLiveMetrics(ctx, teamID)
+	if err != nil {
+		return CurrentStats{}, PeakStats{}, nil, fmt.Errorf("get live metrics: %w", err)
+	}
+	current := CurrentStats{
+		RunningCount:     cur.RunningCount,
+		VCPUsReserved:    cur.VcpusReserved,
+		MemoryMBReserved: cur.MemoryMbReserved,
+	}
+
+	// 30-day peaks. ErrNoRows means no snapshots yet — leave peaks at zero.
+	var peaks PeakStats
+	pk, err := s.DB.GetPeakMetrics(ctx, teamID)
+	if err != nil && !errors.Is(err, pgx.ErrNoRows) {
+		return CurrentStats{}, PeakStats{}, nil, fmt.Errorf("get peak metrics: %w", err)
+	}
+	if err == nil {
+		peaks = PeakStats{
+			RunningCount: pk.PeakRunningCount,
+			VCPUs:        pk.PeakVcpus,
+			MemoryMB:     pk.PeakMemoryMb,
+		}
+	}
+
+	// Time-series — dynamic bucket width, executed via pgx directly.
+	series, err := s.queryTimeSeries(ctx, teamID, cfg)
+	if err != nil {
+		return CurrentStats{}, PeakStats{}, nil, fmt.Errorf("get time series: %w", err)
+	}
+
+	return current, peaks, series, nil
+}
+
+// timeSeriesSQL uses an epoch-floor trick to bucket rows by an arbitrary
+// integer number of seconds without requiring TimescaleDB.
+//
+// $1 = bucket width in seconds (integer)
+// $2 = team_id
+// $3 = lookback interval literal (e.g. '1 hour')
+const timeSeriesSQL = `
+SELECT
+  to_timestamp(floor(extract(epoch FROM sampled_at) / $1) * $1) AS bucket,
+  AVG(running_count)::INTEGER AS running_count,
+  AVG(vcpus_reserved)::INTEGER AS vcpus_reserved,
+  AVG(memory_mb_reserved)::INTEGER AS memory_mb_reserved
+FROM sandbox_metrics_snapshots
+WHERE team_id = $2
+  AND sampled_at >= NOW() - $3::INTERVAL
+GROUP BY bucket
+ORDER BY bucket ASC
+`
+
+// queryTimeSeries executes timeSeriesSQL with the range's bucket width and
+// lookback window, scanning each bucket row into a StatPoint.
+func (s *StatsService) queryTimeSeries(ctx context.Context, teamID string, cfg rangeConfig) ([]StatPoint, error) {
+	rows, err := s.Pool.Query(ctx, timeSeriesSQL, cfg.bucketSec, teamID, cfg.intervalLiteral)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var points []StatPoint
+	for rows.Next() {
+		var p StatPoint
+		var bucket time.Time
+		if err := rows.Scan(&bucket, &p.RunningCount, &p.VCPUsReserved, &p.MemoryMBReserved); err != nil {
+			return nil, err
+		}
+		p.Bucket = bucket
+		points = append(points, p)
+	}
+	// rows.Err surfaces any iteration error that terminated the loop early.
+	return points, rows.Err()
+}
diff --git a/internal/service/team.go b/internal/service/team.go
index 0c1739e..d4c911c 100644
--- a/internal/service/team.go
+++ b/internal/service/team.go
@@ -9,7 +9,6 @@ import (
 
 	"connectrpc.com/connect"
 	"github.com/jackc/pgx/v5"
-	"github.com/jackc/pgx/v5/pgtype"
 	"github.com/jackc/pgx/v5/pgxpool"
 
 	"git.omukk.dev/wrenn/sandbox/internal/db"
@@ -369,12 +368,6 @@ func (s *TeamService) LeaveTeam(ctx context.Context, teamID, callerUserID string
 	return nil
 }
 
-// SearchUsersByEmailPrefix returns up to 10 users whose email starts with the given prefix.
-// The prefix must contain "@" to prevent broad enumeration.
-func (s *TeamService) SearchUsersByEmailPrefix(ctx context.Context, prefix string) ([]db.SearchUsersByEmailPrefixRow, error) {
-	return s.DB.SearchUsersByEmailPrefix(ctx, pgtype.Text{String: prefix, Valid: true})
-}
-
 // SetBYOC enables the BYOC feature flag for a team. Once enabled, BYOC cannot
 // be disabled — it is a one-way transition.
// Admin-only — the caller must verify admin status before invoking this. diff --git a/internal/vm/manager.go b/internal/vm/manager.go index b68bde1..c7e3479 100644 --- a/internal/vm/manager.go +++ b/internal/vm/manager.go @@ -250,6 +250,12 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath return vm, nil } +// PID returns the process ID of the unshare wrapper process. +// The actual Firecracker process is a direct child of this PID. +func (v *VM) PID() int { + return v.process.cmd.Process.Pid +} + // Get returns a running VM by sandbox ID. func (m *Manager) Get(sandboxID string) (*VM, bool) { vm, ok := m.vms[sandboxID] diff --git a/proto/hostagent/gen/hostagent.pb.go b/proto/hostagent/gen/hostagent.pb.go index 7afd4d1..f496b2c 100644 --- a/proto/hostagent/gen/hostagent.pb.go +++ b/proto/hostagent/gen/hostagent.pb.go @@ -1902,6 +1902,275 @@ func (*TerminateResponse) Descriptor() ([]byte, []int) { return file_hostagent_proto_rawDescGZIP(), []int{34} } +type MetricPoint struct { + state protoimpl.MessageState `protogen:"open.v1"` + TimestampUnix int64 `protobuf:"varint,1,opt,name=timestamp_unix,json=timestampUnix,proto3" json:"timestamp_unix,omitempty"` + CpuPct float64 `protobuf:"fixed64,2,opt,name=cpu_pct,json=cpuPct,proto3" json:"cpu_pct,omitempty"` + MemBytes int64 `protobuf:"varint,3,opt,name=mem_bytes,json=memBytes,proto3" json:"mem_bytes,omitempty"` + DiskBytes int64 `protobuf:"varint,4,opt,name=disk_bytes,json=diskBytes,proto3" json:"disk_bytes,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *MetricPoint) Reset() { + *x = MetricPoint{} + mi := &file_hostagent_proto_msgTypes[35] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *MetricPoint) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*MetricPoint) ProtoMessage() {} + +func (x *MetricPoint) ProtoReflect() protoreflect.Message { + mi := 
&file_hostagent_proto_msgTypes[35] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use MetricPoint.ProtoReflect.Descriptor instead. +func (*MetricPoint) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{35} +} + +func (x *MetricPoint) GetTimestampUnix() int64 { + if x != nil { + return x.TimestampUnix + } + return 0 +} + +func (x *MetricPoint) GetCpuPct() float64 { + if x != nil { + return x.CpuPct + } + return 0 +} + +func (x *MetricPoint) GetMemBytes() int64 { + if x != nil { + return x.MemBytes + } + return 0 +} + +func (x *MetricPoint) GetDiskBytes() int64 { + if x != nil { + return x.DiskBytes + } + return 0 +} + +type GetSandboxMetricsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + SandboxId string `protobuf:"bytes,1,opt,name=sandbox_id,json=sandboxId,proto3" json:"sandbox_id,omitempty"` + // Range tier: "10m", "2h", or "24h". + Range string `protobuf:"bytes,2,opt,name=range,proto3" json:"range,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetSandboxMetricsRequest) Reset() { + *x = GetSandboxMetricsRequest{} + mi := &file_hostagent_proto_msgTypes[36] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetSandboxMetricsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetSandboxMetricsRequest) ProtoMessage() {} + +func (x *GetSandboxMetricsRequest) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[36] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetSandboxMetricsRequest.ProtoReflect.Descriptor instead. 
+func (*GetSandboxMetricsRequest) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{36} +} + +func (x *GetSandboxMetricsRequest) GetSandboxId() string { + if x != nil { + return x.SandboxId + } + return "" +} + +func (x *GetSandboxMetricsRequest) GetRange() string { + if x != nil { + return x.Range + } + return "" +} + +type GetSandboxMetricsResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Points []*MetricPoint `protobuf:"bytes,1,rep,name=points,proto3" json:"points,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetSandboxMetricsResponse) Reset() { + *x = GetSandboxMetricsResponse{} + mi := &file_hostagent_proto_msgTypes[37] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetSandboxMetricsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetSandboxMetricsResponse) ProtoMessage() {} + +func (x *GetSandboxMetricsResponse) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[37] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetSandboxMetricsResponse.ProtoReflect.Descriptor instead. 
+func (*GetSandboxMetricsResponse) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{37} +} + +func (x *GetSandboxMetricsResponse) GetPoints() []*MetricPoint { + if x != nil { + return x.Points + } + return nil +} + +type FlushSandboxMetricsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + SandboxId string `protobuf:"bytes,1,opt,name=sandbox_id,json=sandboxId,proto3" json:"sandbox_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *FlushSandboxMetricsRequest) Reset() { + *x = FlushSandboxMetricsRequest{} + mi := &file_hostagent_proto_msgTypes[38] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *FlushSandboxMetricsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*FlushSandboxMetricsRequest) ProtoMessage() {} + +func (x *FlushSandboxMetricsRequest) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[38] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use FlushSandboxMetricsRequest.ProtoReflect.Descriptor instead. 
+func (*FlushSandboxMetricsRequest) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{38} +} + +func (x *FlushSandboxMetricsRequest) GetSandboxId() string { + if x != nil { + return x.SandboxId + } + return "" +} + +type FlushSandboxMetricsResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Points_10M []*MetricPoint `protobuf:"bytes,1,rep,name=points_10m,json=points10m,proto3" json:"points_10m,omitempty"` + Points_2H []*MetricPoint `protobuf:"bytes,2,rep,name=points_2h,json=points2h,proto3" json:"points_2h,omitempty"` + Points_24H []*MetricPoint `protobuf:"bytes,3,rep,name=points_24h,json=points24h,proto3" json:"points_24h,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *FlushSandboxMetricsResponse) Reset() { + *x = FlushSandboxMetricsResponse{} + mi := &file_hostagent_proto_msgTypes[39] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *FlushSandboxMetricsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*FlushSandboxMetricsResponse) ProtoMessage() {} + +func (x *FlushSandboxMetricsResponse) ProtoReflect() protoreflect.Message { + mi := &file_hostagent_proto_msgTypes[39] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use FlushSandboxMetricsResponse.ProtoReflect.Descriptor instead. 
+func (*FlushSandboxMetricsResponse) Descriptor() ([]byte, []int) { + return file_hostagent_proto_rawDescGZIP(), []int{39} +} + +func (x *FlushSandboxMetricsResponse) GetPoints_10M() []*MetricPoint { + if x != nil { + return x.Points_10M + } + return nil +} + +func (x *FlushSandboxMetricsResponse) GetPoints_2H() []*MetricPoint { + if x != nil { + return x.Points_2H + } + return nil +} + +func (x *FlushSandboxMetricsResponse) GetPoints_24H() []*MetricPoint { + if x != nil { + return x.Points_24H + } + return nil +} + var File_hostagent_proto protoreflect.FileDescriptor const file_hostagent_proto_rawDesc = "" + @@ -2029,8 +2298,28 @@ const file_hostagent_proto_rawDesc = "" + "sandbox_id\x18\x01 \x01(\tR\tsandboxId\"\x15\n" + "\x13PingSandboxResponse\"\x12\n" + "\x10TerminateRequest\"\x13\n" + - "\x11TerminateResponse2\x9c\n" + + "\x11TerminateResponse\"\x89\x01\n" + + "\vMetricPoint\x12%\n" + + "\x0etimestamp_unix\x18\x01 \x01(\x03R\rtimestampUnix\x12\x17\n" + + "\acpu_pct\x18\x02 \x01(\x01R\x06cpuPct\x12\x1b\n" + + "\tmem_bytes\x18\x03 \x01(\x03R\bmemBytes\x12\x1d\n" + "\n" + + "disk_bytes\x18\x04 \x01(\x03R\tdiskBytes\"O\n" + + "\x18GetSandboxMetricsRequest\x12\x1d\n" + + "\n" + + "sandbox_id\x18\x01 \x01(\tR\tsandboxId\x12\x14\n" + + "\x05range\x18\x02 \x01(\tR\x05range\"N\n" + + "\x19GetSandboxMetricsResponse\x121\n" + + "\x06points\x18\x01 \x03(\v2\x19.hostagent.v1.MetricPointR\x06points\";\n" + + "\x1aFlushSandboxMetricsRequest\x12\x1d\n" + + "\n" + + "sandbox_id\x18\x01 \x01(\tR\tsandboxId\"\xc9\x01\n" + + "\x1bFlushSandboxMetricsResponse\x128\n" + + "\n" + + "points_10m\x18\x01 \x03(\v2\x19.hostagent.v1.MetricPointR\tpoints10m\x126\n" + + "\tpoints_2h\x18\x02 \x03(\v2\x19.hostagent.v1.MetricPointR\bpoints2h\x128\n" + + "\n" + + "points_24h\x18\x03 \x03(\v2\x19.hostagent.v1.MetricPointR\tpoints24h2\xee\v\n" + "\x10HostAgentService\x12X\n" + "\rCreateSandbox\x12\".hostagent.v1.CreateSandboxRequest\x1a#.hostagent.v1.CreateSandboxResponse\x12[\n" + 
"\x0eDestroySandbox\x12#.hostagent.v1.DestroySandboxRequest\x1a$.hostagent.v1.DestroySandboxResponse\x12U\n" + @@ -2047,7 +2336,9 @@ const file_hostagent_proto_rawDesc = "" + "\x0fWriteFileStream\x12$.hostagent.v1.WriteFileStreamRequest\x1a%.hostagent.v1.WriteFileStreamResponse(\x01\x12]\n" + "\x0eReadFileStream\x12#.hostagent.v1.ReadFileStreamRequest\x1a$.hostagent.v1.ReadFileStreamResponse0\x01\x12R\n" + "\vPingSandbox\x12 .hostagent.v1.PingSandboxRequest\x1a!.hostagent.v1.PingSandboxResponse\x12L\n" + - "\tTerminate\x12\x1e.hostagent.v1.TerminateRequest\x1a\x1f.hostagent.v1.TerminateResponseB\xb0\x01\n" + + "\tTerminate\x12\x1e.hostagent.v1.TerminateRequest\x1a\x1f.hostagent.v1.TerminateResponse\x12d\n" + + "\x11GetSandboxMetrics\x12&.hostagent.v1.GetSandboxMetricsRequest\x1a'.hostagent.v1.GetSandboxMetricsResponse\x12j\n" + + "\x13FlushSandboxMetrics\x12(.hostagent.v1.FlushSandboxMetricsRequest\x1a).hostagent.v1.FlushSandboxMetricsResponseB\xb0\x01\n" + "\x10com.hostagent.v1B\x0eHostagentProtoP\x01Z;git.omukk.dev/wrenn/sandbox/proto/hostagent/gen;hostagentv1\xa2\x02\x03HXX\xaa\x02\fHostagent.V1\xca\x02\fHostagent\\V1\xe2\x02\x18Hostagent\\V1\\GPBMetadata\xea\x02\rHostagent::V1b\x06proto3" var ( @@ -2062,43 +2353,48 @@ func file_hostagent_proto_rawDescGZIP() []byte { return file_hostagent_proto_rawDescData } -var file_hostagent_proto_msgTypes = make([]protoimpl.MessageInfo, 35) +var file_hostagent_proto_msgTypes = make([]protoimpl.MessageInfo, 40) var file_hostagent_proto_goTypes = []any{ - (*CreateSandboxRequest)(nil), // 0: hostagent.v1.CreateSandboxRequest - (*CreateSandboxResponse)(nil), // 1: hostagent.v1.CreateSandboxResponse - (*DestroySandboxRequest)(nil), // 2: hostagent.v1.DestroySandboxRequest - (*DestroySandboxResponse)(nil), // 3: hostagent.v1.DestroySandboxResponse - (*PauseSandboxRequest)(nil), // 4: hostagent.v1.PauseSandboxRequest - (*PauseSandboxResponse)(nil), // 5: hostagent.v1.PauseSandboxResponse - (*ResumeSandboxRequest)(nil), // 6: 
hostagent.v1.ResumeSandboxRequest - (*ResumeSandboxResponse)(nil), // 7: hostagent.v1.ResumeSandboxResponse - (*CreateSnapshotRequest)(nil), // 8: hostagent.v1.CreateSnapshotRequest - (*CreateSnapshotResponse)(nil), // 9: hostagent.v1.CreateSnapshotResponse - (*DeleteSnapshotRequest)(nil), // 10: hostagent.v1.DeleteSnapshotRequest - (*DeleteSnapshotResponse)(nil), // 11: hostagent.v1.DeleteSnapshotResponse - (*ExecRequest)(nil), // 12: hostagent.v1.ExecRequest - (*ExecResponse)(nil), // 13: hostagent.v1.ExecResponse - (*ListSandboxesRequest)(nil), // 14: hostagent.v1.ListSandboxesRequest - (*ListSandboxesResponse)(nil), // 15: hostagent.v1.ListSandboxesResponse - (*SandboxInfo)(nil), // 16: hostagent.v1.SandboxInfo - (*WriteFileRequest)(nil), // 17: hostagent.v1.WriteFileRequest - (*WriteFileResponse)(nil), // 18: hostagent.v1.WriteFileResponse - (*ReadFileRequest)(nil), // 19: hostagent.v1.ReadFileRequest - (*ReadFileResponse)(nil), // 20: hostagent.v1.ReadFileResponse - (*ExecStreamRequest)(nil), // 21: hostagent.v1.ExecStreamRequest - (*ExecStreamResponse)(nil), // 22: hostagent.v1.ExecStreamResponse - (*ExecStreamStart)(nil), // 23: hostagent.v1.ExecStreamStart - (*ExecStreamData)(nil), // 24: hostagent.v1.ExecStreamData - (*ExecStreamEnd)(nil), // 25: hostagent.v1.ExecStreamEnd - (*WriteFileStreamRequest)(nil), // 26: hostagent.v1.WriteFileStreamRequest - (*WriteFileStreamMeta)(nil), // 27: hostagent.v1.WriteFileStreamMeta - (*WriteFileStreamResponse)(nil), // 28: hostagent.v1.WriteFileStreamResponse - (*ReadFileStreamRequest)(nil), // 29: hostagent.v1.ReadFileStreamRequest - (*ReadFileStreamResponse)(nil), // 30: hostagent.v1.ReadFileStreamResponse - (*PingSandboxRequest)(nil), // 31: hostagent.v1.PingSandboxRequest - (*PingSandboxResponse)(nil), // 32: hostagent.v1.PingSandboxResponse - (*TerminateRequest)(nil), // 33: hostagent.v1.TerminateRequest - (*TerminateResponse)(nil), // 34: hostagent.v1.TerminateResponse + (*CreateSandboxRequest)(nil), // 0: 
hostagent.v1.CreateSandboxRequest + (*CreateSandboxResponse)(nil), // 1: hostagent.v1.CreateSandboxResponse + (*DestroySandboxRequest)(nil), // 2: hostagent.v1.DestroySandboxRequest + (*DestroySandboxResponse)(nil), // 3: hostagent.v1.DestroySandboxResponse + (*PauseSandboxRequest)(nil), // 4: hostagent.v1.PauseSandboxRequest + (*PauseSandboxResponse)(nil), // 5: hostagent.v1.PauseSandboxResponse + (*ResumeSandboxRequest)(nil), // 6: hostagent.v1.ResumeSandboxRequest + (*ResumeSandboxResponse)(nil), // 7: hostagent.v1.ResumeSandboxResponse + (*CreateSnapshotRequest)(nil), // 8: hostagent.v1.CreateSnapshotRequest + (*CreateSnapshotResponse)(nil), // 9: hostagent.v1.CreateSnapshotResponse + (*DeleteSnapshotRequest)(nil), // 10: hostagent.v1.DeleteSnapshotRequest + (*DeleteSnapshotResponse)(nil), // 11: hostagent.v1.DeleteSnapshotResponse + (*ExecRequest)(nil), // 12: hostagent.v1.ExecRequest + (*ExecResponse)(nil), // 13: hostagent.v1.ExecResponse + (*ListSandboxesRequest)(nil), // 14: hostagent.v1.ListSandboxesRequest + (*ListSandboxesResponse)(nil), // 15: hostagent.v1.ListSandboxesResponse + (*SandboxInfo)(nil), // 16: hostagent.v1.SandboxInfo + (*WriteFileRequest)(nil), // 17: hostagent.v1.WriteFileRequest + (*WriteFileResponse)(nil), // 18: hostagent.v1.WriteFileResponse + (*ReadFileRequest)(nil), // 19: hostagent.v1.ReadFileRequest + (*ReadFileResponse)(nil), // 20: hostagent.v1.ReadFileResponse + (*ExecStreamRequest)(nil), // 21: hostagent.v1.ExecStreamRequest + (*ExecStreamResponse)(nil), // 22: hostagent.v1.ExecStreamResponse + (*ExecStreamStart)(nil), // 23: hostagent.v1.ExecStreamStart + (*ExecStreamData)(nil), // 24: hostagent.v1.ExecStreamData + (*ExecStreamEnd)(nil), // 25: hostagent.v1.ExecStreamEnd + (*WriteFileStreamRequest)(nil), // 26: hostagent.v1.WriteFileStreamRequest + (*WriteFileStreamMeta)(nil), // 27: hostagent.v1.WriteFileStreamMeta + (*WriteFileStreamResponse)(nil), // 28: hostagent.v1.WriteFileStreamResponse + 
(*ReadFileStreamRequest)(nil), // 29: hostagent.v1.ReadFileStreamRequest + (*ReadFileStreamResponse)(nil), // 30: hostagent.v1.ReadFileStreamResponse + (*PingSandboxRequest)(nil), // 31: hostagent.v1.PingSandboxRequest + (*PingSandboxResponse)(nil), // 32: hostagent.v1.PingSandboxResponse + (*TerminateRequest)(nil), // 33: hostagent.v1.TerminateRequest + (*TerminateResponse)(nil), // 34: hostagent.v1.TerminateResponse + (*MetricPoint)(nil), // 35: hostagent.v1.MetricPoint + (*GetSandboxMetricsRequest)(nil), // 36: hostagent.v1.GetSandboxMetricsRequest + (*GetSandboxMetricsResponse)(nil), // 37: hostagent.v1.GetSandboxMetricsResponse + (*FlushSandboxMetricsRequest)(nil), // 38: hostagent.v1.FlushSandboxMetricsRequest + (*FlushSandboxMetricsResponse)(nil), // 39: hostagent.v1.FlushSandboxMetricsResponse } var file_hostagent_proto_depIdxs = []int32{ 16, // 0: hostagent.v1.ListSandboxesResponse.sandboxes:type_name -> hostagent.v1.SandboxInfo @@ -2106,41 +2402,49 @@ var file_hostagent_proto_depIdxs = []int32{ 24, // 2: hostagent.v1.ExecStreamResponse.data:type_name -> hostagent.v1.ExecStreamData 25, // 3: hostagent.v1.ExecStreamResponse.end:type_name -> hostagent.v1.ExecStreamEnd 27, // 4: hostagent.v1.WriteFileStreamRequest.meta:type_name -> hostagent.v1.WriteFileStreamMeta - 0, // 5: hostagent.v1.HostAgentService.CreateSandbox:input_type -> hostagent.v1.CreateSandboxRequest - 2, // 6: hostagent.v1.HostAgentService.DestroySandbox:input_type -> hostagent.v1.DestroySandboxRequest - 4, // 7: hostagent.v1.HostAgentService.PauseSandbox:input_type -> hostagent.v1.PauseSandboxRequest - 6, // 8: hostagent.v1.HostAgentService.ResumeSandbox:input_type -> hostagent.v1.ResumeSandboxRequest - 12, // 9: hostagent.v1.HostAgentService.Exec:input_type -> hostagent.v1.ExecRequest - 14, // 10: hostagent.v1.HostAgentService.ListSandboxes:input_type -> hostagent.v1.ListSandboxesRequest - 17, // 11: hostagent.v1.HostAgentService.WriteFile:input_type -> hostagent.v1.WriteFileRequest - 19, // 
12: hostagent.v1.HostAgentService.ReadFile:input_type -> hostagent.v1.ReadFileRequest - 8, // 13: hostagent.v1.HostAgentService.CreateSnapshot:input_type -> hostagent.v1.CreateSnapshotRequest - 10, // 14: hostagent.v1.HostAgentService.DeleteSnapshot:input_type -> hostagent.v1.DeleteSnapshotRequest - 21, // 15: hostagent.v1.HostAgentService.ExecStream:input_type -> hostagent.v1.ExecStreamRequest - 26, // 16: hostagent.v1.HostAgentService.WriteFileStream:input_type -> hostagent.v1.WriteFileStreamRequest - 29, // 17: hostagent.v1.HostAgentService.ReadFileStream:input_type -> hostagent.v1.ReadFileStreamRequest - 31, // 18: hostagent.v1.HostAgentService.PingSandbox:input_type -> hostagent.v1.PingSandboxRequest - 33, // 19: hostagent.v1.HostAgentService.Terminate:input_type -> hostagent.v1.TerminateRequest - 1, // 20: hostagent.v1.HostAgentService.CreateSandbox:output_type -> hostagent.v1.CreateSandboxResponse - 3, // 21: hostagent.v1.HostAgentService.DestroySandbox:output_type -> hostagent.v1.DestroySandboxResponse - 5, // 22: hostagent.v1.HostAgentService.PauseSandbox:output_type -> hostagent.v1.PauseSandboxResponse - 7, // 23: hostagent.v1.HostAgentService.ResumeSandbox:output_type -> hostagent.v1.ResumeSandboxResponse - 13, // 24: hostagent.v1.HostAgentService.Exec:output_type -> hostagent.v1.ExecResponse - 15, // 25: hostagent.v1.HostAgentService.ListSandboxes:output_type -> hostagent.v1.ListSandboxesResponse - 18, // 26: hostagent.v1.HostAgentService.WriteFile:output_type -> hostagent.v1.WriteFileResponse - 20, // 27: hostagent.v1.HostAgentService.ReadFile:output_type -> hostagent.v1.ReadFileResponse - 9, // 28: hostagent.v1.HostAgentService.CreateSnapshot:output_type -> hostagent.v1.CreateSnapshotResponse - 11, // 29: hostagent.v1.HostAgentService.DeleteSnapshot:output_type -> hostagent.v1.DeleteSnapshotResponse - 22, // 30: hostagent.v1.HostAgentService.ExecStream:output_type -> hostagent.v1.ExecStreamResponse - 28, // 31: 
hostagent.v1.HostAgentService.WriteFileStream:output_type -> hostagent.v1.WriteFileStreamResponse - 30, // 32: hostagent.v1.HostAgentService.ReadFileStream:output_type -> hostagent.v1.ReadFileStreamResponse - 32, // 33: hostagent.v1.HostAgentService.PingSandbox:output_type -> hostagent.v1.PingSandboxResponse - 34, // 34: hostagent.v1.HostAgentService.Terminate:output_type -> hostagent.v1.TerminateResponse - 20, // [20:35] is the sub-list for method output_type - 5, // [5:20] is the sub-list for method input_type - 5, // [5:5] is the sub-list for extension type_name - 5, // [5:5] is the sub-list for extension extendee - 0, // [0:5] is the sub-list for field type_name + 35, // 5: hostagent.v1.GetSandboxMetricsResponse.points:type_name -> hostagent.v1.MetricPoint + 35, // 6: hostagent.v1.FlushSandboxMetricsResponse.points_10m:type_name -> hostagent.v1.MetricPoint + 35, // 7: hostagent.v1.FlushSandboxMetricsResponse.points_2h:type_name -> hostagent.v1.MetricPoint + 35, // 8: hostagent.v1.FlushSandboxMetricsResponse.points_24h:type_name -> hostagent.v1.MetricPoint + 0, // 9: hostagent.v1.HostAgentService.CreateSandbox:input_type -> hostagent.v1.CreateSandboxRequest + 2, // 10: hostagent.v1.HostAgentService.DestroySandbox:input_type -> hostagent.v1.DestroySandboxRequest + 4, // 11: hostagent.v1.HostAgentService.PauseSandbox:input_type -> hostagent.v1.PauseSandboxRequest + 6, // 12: hostagent.v1.HostAgentService.ResumeSandbox:input_type -> hostagent.v1.ResumeSandboxRequest + 12, // 13: hostagent.v1.HostAgentService.Exec:input_type -> hostagent.v1.ExecRequest + 14, // 14: hostagent.v1.HostAgentService.ListSandboxes:input_type -> hostagent.v1.ListSandboxesRequest + 17, // 15: hostagent.v1.HostAgentService.WriteFile:input_type -> hostagent.v1.WriteFileRequest + 19, // 16: hostagent.v1.HostAgentService.ReadFile:input_type -> hostagent.v1.ReadFileRequest + 8, // 17: hostagent.v1.HostAgentService.CreateSnapshot:input_type -> hostagent.v1.CreateSnapshotRequest + 10, // 18: 
hostagent.v1.HostAgentService.DeleteSnapshot:input_type -> hostagent.v1.DeleteSnapshotRequest + 21, // 19: hostagent.v1.HostAgentService.ExecStream:input_type -> hostagent.v1.ExecStreamRequest + 26, // 20: hostagent.v1.HostAgentService.WriteFileStream:input_type -> hostagent.v1.WriteFileStreamRequest + 29, // 21: hostagent.v1.HostAgentService.ReadFileStream:input_type -> hostagent.v1.ReadFileStreamRequest + 31, // 22: hostagent.v1.HostAgentService.PingSandbox:input_type -> hostagent.v1.PingSandboxRequest + 33, // 23: hostagent.v1.HostAgentService.Terminate:input_type -> hostagent.v1.TerminateRequest + 36, // 24: hostagent.v1.HostAgentService.GetSandboxMetrics:input_type -> hostagent.v1.GetSandboxMetricsRequest + 38, // 25: hostagent.v1.HostAgentService.FlushSandboxMetrics:input_type -> hostagent.v1.FlushSandboxMetricsRequest + 1, // 26: hostagent.v1.HostAgentService.CreateSandbox:output_type -> hostagent.v1.CreateSandboxResponse + 3, // 27: hostagent.v1.HostAgentService.DestroySandbox:output_type -> hostagent.v1.DestroySandboxResponse + 5, // 28: hostagent.v1.HostAgentService.PauseSandbox:output_type -> hostagent.v1.PauseSandboxResponse + 7, // 29: hostagent.v1.HostAgentService.ResumeSandbox:output_type -> hostagent.v1.ResumeSandboxResponse + 13, // 30: hostagent.v1.HostAgentService.Exec:output_type -> hostagent.v1.ExecResponse + 15, // 31: hostagent.v1.HostAgentService.ListSandboxes:output_type -> hostagent.v1.ListSandboxesResponse + 18, // 32: hostagent.v1.HostAgentService.WriteFile:output_type -> hostagent.v1.WriteFileResponse + 20, // 33: hostagent.v1.HostAgentService.ReadFile:output_type -> hostagent.v1.ReadFileResponse + 9, // 34: hostagent.v1.HostAgentService.CreateSnapshot:output_type -> hostagent.v1.CreateSnapshotResponse + 11, // 35: hostagent.v1.HostAgentService.DeleteSnapshot:output_type -> hostagent.v1.DeleteSnapshotResponse + 22, // 36: hostagent.v1.HostAgentService.ExecStream:output_type -> hostagent.v1.ExecStreamResponse + 28, // 37: 
hostagent.v1.HostAgentService.WriteFileStream:output_type -> hostagent.v1.WriteFileStreamResponse + 30, // 38: hostagent.v1.HostAgentService.ReadFileStream:output_type -> hostagent.v1.ReadFileStreamResponse + 32, // 39: hostagent.v1.HostAgentService.PingSandbox:output_type -> hostagent.v1.PingSandboxResponse + 34, // 40: hostagent.v1.HostAgentService.Terminate:output_type -> hostagent.v1.TerminateResponse + 37, // 41: hostagent.v1.HostAgentService.GetSandboxMetrics:output_type -> hostagent.v1.GetSandboxMetricsResponse + 39, // 42: hostagent.v1.HostAgentService.FlushSandboxMetrics:output_type -> hostagent.v1.FlushSandboxMetricsResponse + 26, // [26:43] is the sub-list for method output_type + 9, // [9:26] is the sub-list for method input_type + 9, // [9:9] is the sub-list for extension type_name + 9, // [9:9] is the sub-list for extension extendee + 0, // [0:9] is the sub-list for field type_name } func init() { file_hostagent_proto_init() } @@ -2167,7 +2471,7 @@ func file_hostagent_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_hostagent_proto_rawDesc), len(file_hostagent_proto_rawDesc)), NumEnums: 0, - NumMessages: 35, + NumMessages: 40, NumExtensions: 0, NumServices: 1, }, diff --git a/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go b/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go index d144451..7f0fa70 100644 --- a/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go +++ b/proto/hostagent/gen/hostagentv1connect/hostagent.connect.go @@ -77,6 +77,12 @@ const ( // HostAgentServiceTerminateProcedure is the fully-qualified name of the HostAgentService's // Terminate RPC. HostAgentServiceTerminateProcedure = "/hostagent.v1.HostAgentService/Terminate" + // HostAgentServiceGetSandboxMetricsProcedure is the fully-qualified name of the HostAgentService's + // GetSandboxMetrics RPC. 
+ HostAgentServiceGetSandboxMetricsProcedure = "/hostagent.v1.HostAgentService/GetSandboxMetrics" + // HostAgentServiceFlushSandboxMetricsProcedure is the fully-qualified name of the + // HostAgentService's FlushSandboxMetrics RPC. + HostAgentServiceFlushSandboxMetricsProcedure = "/hostagent.v1.HostAgentService/FlushSandboxMetrics" ) // HostAgentServiceClient is a client for the hostagent.v1.HostAgentService service. @@ -115,6 +121,11 @@ type HostAgentServiceClient interface { // Called by the control plane immediately when a host is deleted so the // agent shuts down without waiting for the next heartbeat cycle. Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) + // GetSandboxMetrics returns ring buffer metrics for a running sandbox. + GetSandboxMetrics(context.Context, *connect.Request[gen.GetSandboxMetricsRequest]) (*connect.Response[gen.GetSandboxMetricsResponse], error) + // FlushSandboxMetrics returns all ring buffer tiers and clears them. + // Called by the control plane before pause/destroy to persist metrics to DB. + FlushSandboxMetrics(context.Context, *connect.Request[gen.FlushSandboxMetricsRequest]) (*connect.Response[gen.FlushSandboxMetricsResponse], error) } // NewHostAgentServiceClient constructs a client for the hostagent.v1.HostAgentService service. 
By @@ -218,26 +229,40 @@ func NewHostAgentServiceClient(httpClient connect.HTTPClient, baseURL string, op connect.WithSchema(hostAgentServiceMethods.ByName("Terminate")), connect.WithClientOptions(opts...), ), + getSandboxMetrics: connect.NewClient[gen.GetSandboxMetricsRequest, gen.GetSandboxMetricsResponse]( + httpClient, + baseURL+HostAgentServiceGetSandboxMetricsProcedure, + connect.WithSchema(hostAgentServiceMethods.ByName("GetSandboxMetrics")), + connect.WithClientOptions(opts...), + ), + flushSandboxMetrics: connect.NewClient[gen.FlushSandboxMetricsRequest, gen.FlushSandboxMetricsResponse]( + httpClient, + baseURL+HostAgentServiceFlushSandboxMetricsProcedure, + connect.WithSchema(hostAgentServiceMethods.ByName("FlushSandboxMetrics")), + connect.WithClientOptions(opts...), + ), } } // hostAgentServiceClient implements HostAgentServiceClient. type hostAgentServiceClient struct { - createSandbox *connect.Client[gen.CreateSandboxRequest, gen.CreateSandboxResponse] - destroySandbox *connect.Client[gen.DestroySandboxRequest, gen.DestroySandboxResponse] - pauseSandbox *connect.Client[gen.PauseSandboxRequest, gen.PauseSandboxResponse] - resumeSandbox *connect.Client[gen.ResumeSandboxRequest, gen.ResumeSandboxResponse] - exec *connect.Client[gen.ExecRequest, gen.ExecResponse] - listSandboxes *connect.Client[gen.ListSandboxesRequest, gen.ListSandboxesResponse] - writeFile *connect.Client[gen.WriteFileRequest, gen.WriteFileResponse] - readFile *connect.Client[gen.ReadFileRequest, gen.ReadFileResponse] - createSnapshot *connect.Client[gen.CreateSnapshotRequest, gen.CreateSnapshotResponse] - deleteSnapshot *connect.Client[gen.DeleteSnapshotRequest, gen.DeleteSnapshotResponse] - execStream *connect.Client[gen.ExecStreamRequest, gen.ExecStreamResponse] - writeFileStream *connect.Client[gen.WriteFileStreamRequest, gen.WriteFileStreamResponse] - readFileStream *connect.Client[gen.ReadFileStreamRequest, gen.ReadFileStreamResponse] - pingSandbox 
*connect.Client[gen.PingSandboxRequest, gen.PingSandboxResponse] - terminate *connect.Client[gen.TerminateRequest, gen.TerminateResponse] + createSandbox *connect.Client[gen.CreateSandboxRequest, gen.CreateSandboxResponse] + destroySandbox *connect.Client[gen.DestroySandboxRequest, gen.DestroySandboxResponse] + pauseSandbox *connect.Client[gen.PauseSandboxRequest, gen.PauseSandboxResponse] + resumeSandbox *connect.Client[gen.ResumeSandboxRequest, gen.ResumeSandboxResponse] + exec *connect.Client[gen.ExecRequest, gen.ExecResponse] + listSandboxes *connect.Client[gen.ListSandboxesRequest, gen.ListSandboxesResponse] + writeFile *connect.Client[gen.WriteFileRequest, gen.WriteFileResponse] + readFile *connect.Client[gen.ReadFileRequest, gen.ReadFileResponse] + createSnapshot *connect.Client[gen.CreateSnapshotRequest, gen.CreateSnapshotResponse] + deleteSnapshot *connect.Client[gen.DeleteSnapshotRequest, gen.DeleteSnapshotResponse] + execStream *connect.Client[gen.ExecStreamRequest, gen.ExecStreamResponse] + writeFileStream *connect.Client[gen.WriteFileStreamRequest, gen.WriteFileStreamResponse] + readFileStream *connect.Client[gen.ReadFileStreamRequest, gen.ReadFileStreamResponse] + pingSandbox *connect.Client[gen.PingSandboxRequest, gen.PingSandboxResponse] + terminate *connect.Client[gen.TerminateRequest, gen.TerminateResponse] + getSandboxMetrics *connect.Client[gen.GetSandboxMetricsRequest, gen.GetSandboxMetricsResponse] + flushSandboxMetrics *connect.Client[gen.FlushSandboxMetricsRequest, gen.FlushSandboxMetricsResponse] } // CreateSandbox calls hostagent.v1.HostAgentService.CreateSandbox. @@ -315,6 +340,16 @@ func (c *hostAgentServiceClient) Terminate(ctx context.Context, req *connect.Req return c.terminate.CallUnary(ctx, req) } +// GetSandboxMetrics calls hostagent.v1.HostAgentService.GetSandboxMetrics. 
+func (c *hostAgentServiceClient) GetSandboxMetrics(ctx context.Context, req *connect.Request[gen.GetSandboxMetricsRequest]) (*connect.Response[gen.GetSandboxMetricsResponse], error) { + return c.getSandboxMetrics.CallUnary(ctx, req) +} + +// FlushSandboxMetrics calls hostagent.v1.HostAgentService.FlushSandboxMetrics. +func (c *hostAgentServiceClient) FlushSandboxMetrics(ctx context.Context, req *connect.Request[gen.FlushSandboxMetricsRequest]) (*connect.Response[gen.FlushSandboxMetricsResponse], error) { + return c.flushSandboxMetrics.CallUnary(ctx, req) +} + // HostAgentServiceHandler is an implementation of the hostagent.v1.HostAgentService service. type HostAgentServiceHandler interface { // CreateSandbox boots a new microVM with the given configuration. @@ -351,6 +386,11 @@ type HostAgentServiceHandler interface { // Called by the control plane immediately when a host is deleted so the // agent shuts down without waiting for the next heartbeat cycle. Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) + // GetSandboxMetrics returns ring buffer metrics for a running sandbox. + GetSandboxMetrics(context.Context, *connect.Request[gen.GetSandboxMetricsRequest]) (*connect.Response[gen.GetSandboxMetricsResponse], error) + // FlushSandboxMetrics returns all ring buffer tiers and clears them. + // Called by the control plane before pause/destroy to persist metrics to DB. + FlushSandboxMetrics(context.Context, *connect.Request[gen.FlushSandboxMetricsRequest]) (*connect.Response[gen.FlushSandboxMetricsResponse], error) } // NewHostAgentServiceHandler builds an HTTP handler from the service implementation. 
It returns the @@ -450,6 +490,18 @@ func NewHostAgentServiceHandler(svc HostAgentServiceHandler, opts ...connect.Han connect.WithSchema(hostAgentServiceMethods.ByName("Terminate")), connect.WithHandlerOptions(opts...), ) + hostAgentServiceGetSandboxMetricsHandler := connect.NewUnaryHandler( + HostAgentServiceGetSandboxMetricsProcedure, + svc.GetSandboxMetrics, + connect.WithSchema(hostAgentServiceMethods.ByName("GetSandboxMetrics")), + connect.WithHandlerOptions(opts...), + ) + hostAgentServiceFlushSandboxMetricsHandler := connect.NewUnaryHandler( + HostAgentServiceFlushSandboxMetricsProcedure, + svc.FlushSandboxMetrics, + connect.WithSchema(hostAgentServiceMethods.ByName("FlushSandboxMetrics")), + connect.WithHandlerOptions(opts...), + ) return "/hostagent.v1.HostAgentService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.URL.Path { case HostAgentServiceCreateSandboxProcedure: @@ -482,6 +534,10 @@ func NewHostAgentServiceHandler(svc HostAgentServiceHandler, opts ...connect.Han hostAgentServicePingSandboxHandler.ServeHTTP(w, r) case HostAgentServiceTerminateProcedure: hostAgentServiceTerminateHandler.ServeHTTP(w, r) + case HostAgentServiceGetSandboxMetricsProcedure: + hostAgentServiceGetSandboxMetricsHandler.ServeHTTP(w, r) + case HostAgentServiceFlushSandboxMetricsProcedure: + hostAgentServiceFlushSandboxMetricsHandler.ServeHTTP(w, r) default: http.NotFound(w, r) } @@ -550,3 +606,11 @@ func (UnimplementedHostAgentServiceHandler) PingSandbox(context.Context, *connec func (UnimplementedHostAgentServiceHandler) Terminate(context.Context, *connect.Request[gen.TerminateRequest]) (*connect.Response[gen.TerminateResponse], error) { return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hostagent.v1.HostAgentService.Terminate is not implemented")) } + +func (UnimplementedHostAgentServiceHandler) GetSandboxMetrics(context.Context, *connect.Request[gen.GetSandboxMetricsRequest]) (*connect.Response[gen.GetSandboxMetricsResponse], 
error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hostagent.v1.HostAgentService.GetSandboxMetrics is not implemented")) +} + +func (UnimplementedHostAgentServiceHandler) FlushSandboxMetrics(context.Context, *connect.Request[gen.FlushSandboxMetricsRequest]) (*connect.Response[gen.FlushSandboxMetricsResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("hostagent.v1.HostAgentService.FlushSandboxMetrics is not implemented")) +} diff --git a/proto/hostagent/hostagent.proto b/proto/hostagent/hostagent.proto index c9cfffa..214a84e 100644 --- a/proto/hostagent/hostagent.proto +++ b/proto/hostagent/hostagent.proto @@ -54,6 +54,13 @@ service HostAgentService { // agent shuts down without waiting for the next heartbeat cycle. rpc Terminate(TerminateRequest) returns (TerminateResponse); + // GetSandboxMetrics returns ring buffer metrics for a running sandbox. + rpc GetSandboxMetrics(GetSandboxMetricsRequest) returns (GetSandboxMetricsResponse); + + // FlushSandboxMetrics returns all ring buffer tiers and clears them. + // Called by the control plane before pause/destroy to persist metrics to DB. + rpc FlushSandboxMetrics(FlushSandboxMetricsRequest) returns (FlushSandboxMetricsResponse); + } message CreateSandboxRequest { @@ -248,3 +255,32 @@ message PingSandboxResponse {} message TerminateRequest {} message TerminateResponse {} + +// ── Metrics ────────────────────────────────────────────────────────── + +message MetricPoint { + int64 timestamp_unix = 1; + double cpu_pct = 2; + int64 mem_bytes = 3; + int64 disk_bytes = 4; +} + +message GetSandboxMetricsRequest { + string sandbox_id = 1; + // Range tier: "10m", "2h", or "24h". 
+ string range = 2; +} + +message GetSandboxMetricsResponse { + repeated MetricPoint points = 1; +} + +message FlushSandboxMetricsRequest { + string sandbox_id = 1; +} + +message FlushSandboxMetricsResponse { + repeated MetricPoint points_10m = 1; + repeated MetricPoint points_2h = 2; + repeated MetricPoint points_24h = 3; +}