1
0
forked from wrenn/wrenn

Fix metrics sampler to record zero-value snapshots when idle

SampleSandboxMetrics previously filtered WHERE status IN ('running',
'starting', 'paused'), which returned no rows for a team once all of
its capsules were stopped. As a result, no zero-value snapshot was
recorded for that team, leaving the time-series charts with no trailing
data points instead of showing the expected zero values.

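Remove the WHERE filter so the query groups by all teams that have
any sandbox row. The per-status FILTER clauses on the aggregates
already produce correct zero values for teams whose capsules are all
stopped. Teams with no sandbox rows at all still produce no snapshot,
because the query only groups over rows present in sandboxes.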

Also includes the per-VM RAM ceiling formula change for paused capsules:
reserved memory for paused VMs is now sum(ceil(each/2)) instead of
ceil(sum/2).
This commit is contained in:
2026-03-25 15:50:19 +06:00
parent 930da8a578
commit e3750f79f9
2 changed files with 14 additions and 10 deletions

View File

@ -5,12 +5,12 @@ VALUES ($1, $2, $3, $4);
-- name: GetLiveMetrics :one -- name: GetLiveMetrics :one
-- Reads directly from sandboxes for accurate real-time current values. -- Reads directly from sandboxes for accurate real-time current values.
-- CPU reserved = running + starting only (paused VMs release CPU). -- CPU reserved = running + starting only (paused VMs release CPU).
-- RAM reserved = running + starting + ceil(paused/2) (capacity held for resume). -- RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling).
SELECT SELECT
(COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count,
(COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved,
(COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0)
+ CEIL(COALESCE(SUM(memory_mb) FILTER (WHERE status = 'paused'), 0)::NUMERIC / 2))::INTEGER AS memory_mb_reserved + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved
FROM sandboxes FROM sandboxes
WHERE team_id = $1; WHERE team_id = $1;
@ -29,14 +29,16 @@ WHERE sampled_at < NOW() - INTERVAL '60 days';
-- name: SampleSandboxMetrics :many -- name: SampleSandboxMetrics :many
-- Aggregates per-team resource usage from the live sandboxes table. -- Aggregates per-team resource usage from the live sandboxes table.
-- Groups by all teams that have any sandbox row (including stopped) so that
-- zero-value snapshots are recorded when all capsules are stopped, keeping the
-- time-series charts continuous rather than trailing off into empty space.
-- CPU reserved = running + starting only (paused VMs release CPU). -- CPU reserved = running + starting only (paused VMs release CPU).
-- RAM reserved = running + starting + ceil(paused/2) (capacity held for resume). -- RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling).
SELECT SELECT
team_id, team_id,
(COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count,
(COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved,
(COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0)
+ CEIL(COALESCE(SUM(memory_mb) FILTER (WHERE status = 'paused'), 0)::NUMERIC / 2))::INTEGER AS memory_mb_reserved + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved
FROM sandboxes FROM sandboxes
WHERE status IN ('running', 'starting', 'paused')
GROUP BY team_id; GROUP BY team_id;

View File

@ -14,7 +14,7 @@ SELECT
(COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count,
(COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved,
(COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0)
+ CEIL(COALESCE(SUM(memory_mb) FILTER (WHERE status = 'paused'), 0)::NUMERIC / 2))::INTEGER AS memory_mb_reserved + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved
FROM sandboxes FROM sandboxes
WHERE team_id = $1 WHERE team_id = $1
` `
@ -27,7 +27,7 @@ type GetLiveMetricsRow struct {
// Reads directly from sandboxes for accurate real-time current values. // Reads directly from sandboxes for accurate real-time current values.
// CPU reserved = running + starting only (paused VMs release CPU). // CPU reserved = running + starting only (paused VMs release CPU).
// RAM reserved = running + starting + ceil(paused/2) (capacity held for resume). // RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling).
func (q *Queries) GetLiveMetrics(ctx context.Context, teamID string) (GetLiveMetricsRow, error) { func (q *Queries) GetLiveMetrics(ctx context.Context, teamID string) (GetLiveMetricsRow, error) {
row := q.db.QueryRow(ctx, getLiveMetrics, teamID) row := q.db.QueryRow(ctx, getLiveMetrics, teamID)
var i GetLiveMetricsRow var i GetLiveMetricsRow
@ -96,9 +96,8 @@ SELECT
(COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count, (COUNT(*) FILTER (WHERE status IN ('running', 'starting')))::INTEGER AS running_count,
(COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved, (COALESCE(SUM(vcpus) FILTER (WHERE status IN ('running', 'starting')), 0))::INTEGER AS vcpus_reserved,
(COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0) (COALESCE(SUM(memory_mb) FILTER (WHERE status IN ('running', 'starting')), 0)
+ CEIL(COALESCE(SUM(memory_mb) FILTER (WHERE status = 'paused'), 0)::NUMERIC / 2))::INTEGER AS memory_mb_reserved + COALESCE(SUM(CEIL(memory_mb::NUMERIC / 2)) FILTER (WHERE status = 'paused'), 0))::INTEGER AS memory_mb_reserved
FROM sandboxes FROM sandboxes
WHERE status IN ('running', 'starting', 'paused')
GROUP BY team_id GROUP BY team_id
` `
@ -110,8 +109,11 @@ type SampleSandboxMetricsRow struct {
} }
// Aggregates per-team resource usage from the live sandboxes table. // Aggregates per-team resource usage from the live sandboxes table.
// Groups by all teams that have any sandbox row (including stopped) so that
// zero-value snapshots are recorded when all capsules are stopped, keeping the
// time-series charts continuous rather than trailing off into empty space.
// CPU reserved = running + starting only (paused VMs release CPU). // CPU reserved = running + starting only (paused VMs release CPU).
// RAM reserved = running + starting + ceil(paused/2) (capacity held for resume). // RAM reserved = running + starting + sum(ceil(each_paused/2)) (per-VM ceiling).
func (q *Queries) SampleSandboxMetrics(ctx context.Context) ([]SampleSandboxMetricsRow, error) { func (q *Queries) SampleSandboxMetrics(ctx context.Context) ([]SampleSandboxMetricsRow, error) {
rows, err := q.db.Query(ctx, sampleSandboxMetrics) rows, err := q.db.Query(ctx, sampleSandboxMetrics)
if err != nil { if err != nil {