1
0
forked from wrenn/wrenn

Add per-sandbox CPU/memory/disk metrics collection

Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers downsample the samples into 30s and 5min averages, giving
10min/2h/24h retention. Metrics are flushed to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
2026-03-25 20:10:33 +06:00
parent 7473c15f52
commit 9acdbb5ae9
16 changed files with 1430 additions and 90 deletions

View File

@ -0,0 +1,16 @@
-- +goose Up
-- Persisted sandbox metric samples: one row per (sandbox_id, tier, ts).
-- tier is restricted to the three retention windows '10m', '2h' and '24h'.
CREATE TABLE sandbox_metric_points (
    sandbox_id TEXT   NOT NULL,
    tier       TEXT   NOT NULL CHECK (tier IN ('10m', '2h', '24h')),
    -- NOTE(review): presumably Unix epoch seconds -- confirm against the writer.
    ts         BIGINT NOT NULL,
    cpu_pct    FLOAT8 NOT NULL DEFAULT 0,
    mem_bytes  BIGINT NOT NULL DEFAULT 0,
    disk_bytes BIGINT NOT NULL DEFAULT 0,
    PRIMARY KEY (sandbox_id, tier, ts)
);
-- No separate index on (sandbox_id, tier): it is a left prefix of the primary
-- key, so the primary-key b-tree already serves lookups by sandbox_id and by
-- (sandbox_id, tier). A second index would only add write and storage cost.

-- +goose Down
DROP TABLE IF EXISTS sandbox_metric_points;

View File

@ -27,6 +27,30 @@ WHERE team_id = $1
DELETE FROM sandbox_metrics_snapshots
WHERE sampled_at < NOW() - INTERVAL '60 days';
-- name: InsertSandboxMetricPoint :exec
-- Stores one metric sample for a sandbox tier. If a row with the same
-- (sandbox_id, tier, ts) key already exists, it is left untouched.
INSERT INTO sandbox_metric_points (
    sandbox_id,
    tier,
    ts,
    cpu_pct,
    mem_bytes,
    disk_bytes
)
VALUES (
    $1,
    $2,
    $3,
    $4,
    $5,
    $6
)
ON CONFLICT (sandbox_id, tier, ts)
DO NOTHING;
-- name: GetSandboxMetricPoints :many
-- Returns every stored point for one sandbox ($1) and tier ($2), ordered by
-- ts from smallest to largest (ascending is the default sort direction).
SELECT
    ts,
    cpu_pct,
    mem_bytes,
    disk_bytes
FROM sandbox_metric_points
WHERE
    sandbox_id = $1
    AND tier = $2
ORDER BY ts;
-- name: DeleteSandboxMetricPoints :exec
-- Removes all stored points, across every tier, for the sandbox given as $1.
DELETE
FROM sandbox_metric_points
WHERE
    sandbox_id = $1;
-- name: DeleteSandboxMetricPointsByTier :exec
-- Removes the stored points of a single tier ($2) for the sandbox given as $1;
-- rows belonging to the sandbox's other tiers are not touched.
DELETE
FROM sandbox_metric_points
WHERE
    sandbox_id = $1
    AND tier = $2;
-- name: PruneSandboxMetricPoints :exec
-- Removes every metric point whose ts is more than 30 days in the past.
-- The cutoff is the current time minus 30 days expressed as Unix epoch
-- seconds, so ts is compared as an epoch-seconds value.
-- NOTE(review): there is no filter on sandbox state -- points are pruned for
-- all sandboxes, not only destroyed ones. If metrics of long-paused sandboxes
-- must survive, a state filter has to be added here; confirm intended scope.
DELETE FROM sandbox_metric_points
WHERE ts < EXTRACT(EPOCH FROM NOW() - INTERVAL '30 days')::BIGINT;
-- name: SampleSandboxMetrics :many
-- Aggregates per-team resource usage from the live sandboxes table.
-- Groups by all teams that have any sandbox row (including stopped) so that