forked from wrenn/wrenn
Add per-sandbox CPU/memory/disk metrics collection
Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers hold the raw 500ms samples plus 30s and 5min downsampled
averages, giving 10min/2h/24h retention respectively. Metrics are flushed
to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
16
db/migrations/20260325135035_add_sandbox_metric_points.sql
Normal file
16
db/migrations/20260325135035_add_sandbox_metric_points.sql
Normal file
@ -0,0 +1,16 @@
|
||||
-- +goose Up
|
||||
-- Persisted per-sandbox resource samples: one row per (sandbox, tier, timestamp).
CREATE TABLE sandbox_metric_points (
    -- Sandbox the sample belongs to (no foreign key is declared on this column).
    sandbox_id TEXT             NOT NULL,
    -- Retention tier label, restricted to the three values in the CHECK below.
    tier       TEXT             NOT NULL,
    -- Sample time as an integer, presumably Unix epoch seconds (TODO confirm with the writer).
    ts         BIGINT           NOT NULL,
    -- CPU usage percentage for the sample.
    cpu_pct    DOUBLE PRECISION NOT NULL DEFAULT 0,
    -- Memory and disk usage for the sample, in bytes.
    mem_bytes  BIGINT           NOT NULL DEFAULT 0,
    disk_bytes BIGINT           NOT NULL DEFAULT 0,
    -- Constraint names are spelled out and match the names PostgreSQL would
    -- otherwise generate, so they stay greppable without altering the schema.
    CONSTRAINT sandbox_metric_points_tier_check
        CHECK (tier IN ('10m', '2h', '24h')),
    CONSTRAINT sandbox_metric_points_pkey
        PRIMARY KEY (sandbox_id, tier, ts)
);
|
||||
|
||||
-- No secondary index on (sandbox_id, tier) is created here. The primary key on
-- (sandbox_id, tier, ts) is backed by a B-tree whose leading columns already
-- serve lookups and deletes filtered by sandbox_id or by (sandbox_id, tier).
-- A separate prefix index would only add write and storage overhead.
|
||||
|
||||
-- +goose Down
|
||||
-- Reverts the Up step. Dropping the table also removes its primary key, CHECK
-- constraint and any indexes defined on it, so no separate DROP INDEX is needed.
DROP TABLE IF EXISTS sandbox_metric_points;
|
||||
@ -27,6 +27,30 @@ WHERE team_id = $1
|
||||
DELETE FROM sandbox_metrics_snapshots
|
||||
WHERE sampled_at < NOW() - INTERVAL '60 days';
|
||||
|
||||
-- name: InsertSandboxMetricPoint :exec
-- Stores one metric sample. A row that already exists for the same
-- (sandbox_id, tier, ts) key is left untouched rather than overwritten.
INSERT INTO sandbox_metric_points (
    sandbox_id,
    tier,
    ts,
    cpu_pct,
    mem_bytes,
    disk_bytes
)
VALUES (
    $1,
    $2,
    $3,
    $4,
    $5,
    $6
)
ON CONFLICT (sandbox_id, tier, ts) DO NOTHING;
|
||||
|
||||
-- name: GetSandboxMetricPoints :many
-- Returns every stored sample for one sandbox and tier, oldest first.
SELECT
    ts,
    cpu_pct,
    mem_bytes,
    disk_bytes
FROM sandbox_metric_points
WHERE sandbox_id = $1
  AND tier = $2
ORDER BY ts ASC;
|
||||
|
||||
-- name: DeleteSandboxMetricPoints :exec
-- Removes all stored samples for one sandbox, across every tier.
DELETE FROM sandbox_metric_points
WHERE sandbox_id = $1;
|
||||
|
||||
-- name: DeleteSandboxMetricPointsByTier :exec
-- Removes the stored samples of a single tier for one sandbox.
-- Samples in the sandbox's other tiers are left in place.
DELETE FROM sandbox_metric_points
WHERE sandbox_id = $1
  AND tier = $2;
|
||||
|
||||
-- name: PruneSandboxMetricPoints :exec
-- Removes every metric point whose ts is more than 30 days in the past.
-- The cutoff is computed in Unix epoch seconds, so ts is presumably stored in
-- seconds. If it were milliseconds, no row would ever match (TODO confirm).
-- NOTE(review): the statement does not filter on sandbox state, so points of
-- sandboxes that are still paused are pruned as well. Confirm this is intended
-- or restrict the delete to destroyed sandboxes.
DELETE FROM sandbox_metric_points
WHERE ts < EXTRACT(EPOCH FROM (NOW() - INTERVAL '30 days'))::BIGINT;
|
||||
|
||||
-- name: SampleSandboxMetrics :many
|
||||
-- Aggregates per-team resource usage from the live sandboxes table.
|
||||
-- Groups by all teams that have any sandbox row (including stopped) so that
|
||||
|
||||
Reference in New Issue
Block a user