1
0
forked from wrenn/wrenn

Add per-sandbox CPU/memory/disk metrics collection

Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers downsample into 30s and 5min averages for 10min/2h/24h
retention. Metrics are flushed to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
2026-03-25 20:10:33 +06:00
parent 7473c15f52
commit 9acdbb5ae9
16 changed files with 1430 additions and 90 deletions

View File

@ -0,0 +1,130 @@
package api
import (
"context"
"net/http"
"connectrpc.com/connect"
"github.com/go-chi/chi/v5"
"git.omukk.dev/wrenn/sandbox/internal/auth"
"git.omukk.dev/wrenn/sandbox/internal/db"
"git.omukk.dev/wrenn/sandbox/internal/lifecycle"
pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
)
type sandboxMetricsHandler struct {
db *db.Queries
pool *lifecycle.HostClientPool
}
func newSandboxMetricsHandler(db *db.Queries, pool *lifecycle.HostClientPool) *sandboxMetricsHandler {
return &sandboxMetricsHandler{db: db, pool: pool}
}
type metricPointResponse struct {
TimestampUnix int64 `json:"timestamp_unix"`
CPUPct float64 `json:"cpu_pct"`
MemBytes int64 `json:"mem_bytes"`
DiskBytes int64 `json:"disk_bytes"`
}
type metricsResponse struct {
SandboxID string `json:"sandbox_id"`
Range string `json:"range"`
Points []metricPointResponse `json:"points"`
}
// GetMetrics handles GET /v1/sandboxes/{id}/metrics?range=10m|2h|24h.
func (h *sandboxMetricsHandler) GetMetrics(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
ac := auth.MustFromContext(ctx)
rangeTier := r.URL.Query().Get("range")
if rangeTier == "" {
rangeTier = "10m"
}
if rangeTier != "10m" && rangeTier != "2h" && rangeTier != "24h" {
writeError(w, http.StatusBadRequest, "invalid_request", "range must be 10m, 2h, or 24h")
return
}
sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
if err != nil {
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
return
}
switch sb.Status {
case "running":
h.getFromAgent(w, r, sandboxID, rangeTier, sb.HostID)
case "paused":
h.getFromDB(ctx, w, sandboxID, rangeTier)
default:
writeError(w, http.StatusNotFound, "not_found", "metrics not available for sandbox in state: "+sb.Status)
}
}
func (h *sandboxMetricsHandler) getFromAgent(w http.ResponseWriter, r *http.Request, sandboxID, rangeTier, hostID string) {
ctx := r.Context()
agent, err := agentForHost(ctx, h.db, h.pool, hostID)
if err != nil {
writeError(w, http.StatusServiceUnavailable, "host_unavailable", "sandbox host is not reachable")
return
}
resp, err := agent.GetSandboxMetrics(ctx, connect.NewRequest(&pb.GetSandboxMetricsRequest{
SandboxId: sandboxID,
Range: rangeTier,
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
}
points := make([]metricPointResponse, len(resp.Msg.Points))
for i, p := range resp.Msg.Points {
points[i] = metricPointResponse{
TimestampUnix: p.TimestampUnix,
CPUPct: p.CpuPct,
MemBytes: p.MemBytes,
DiskBytes: p.DiskBytes,
}
}
writeJSON(w, http.StatusOK, metricsResponse{
SandboxID: sandboxID,
Range: rangeTier,
Points: points,
})
}
func (h *sandboxMetricsHandler) getFromDB(ctx context.Context, w http.ResponseWriter, sandboxID, rangeTier string) {
rows, err := h.db.GetSandboxMetricPoints(ctx, db.GetSandboxMetricPointsParams{
SandboxID: sandboxID,
Tier: rangeTier,
})
if err != nil {
writeError(w, http.StatusInternalServerError, "internal_error", "failed to read metrics")
return
}
points := make([]metricPointResponse, len(rows))
for i, row := range rows {
points[i] = metricPointResponse{
TimestampUnix: row.Ts,
CPUPct: row.CpuPct,
MemBytes: row.MemBytes,
DiskBytes: row.DiskBytes,
}
}
writeJSON(w, http.StatusOK, metricsResponse{
SandboxID: sandboxID,
Range: rangeTier,
Points: points,
})
}

View File

@ -751,6 +751,60 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/sandboxes/{id}/metrics:
parameters:
- name: id
in: path
required: true
schema:
type: string
get:
summary: Get per-sandbox resource metrics
operationId: getSandboxMetrics
tags: [sandboxes]
security:
- apiKeyAuth: []
- bearerAuth: []
description: |
Returns time-series CPU, memory, and disk metrics for a sandbox.
Three tiers are available with different granularity and retention:
- `10m`: 500ms samples, last 10 minutes
- `2h`: 30-second averages, last 2 hours
- `24h`: 5-minute averages, last 24 hours
For running sandboxes, data comes from the host agent's in-memory
ring buffer. For paused sandboxes, data is read from persisted
snapshots in the database. Stopped/destroyed sandboxes return 404.
parameters:
- name: range
in: query
required: false
schema:
type: string
enum: ["10m", "2h", "24h"]
default: "10m"
description: Time range tier to query
responses:
"200":
description: Metrics retrieved
content:
application/json:
schema:
$ref: "#/components/schemas/SandboxMetrics"
"400":
description: Invalid range parameter
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"404":
description: Sandbox not found or metrics not available
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/sandboxes/{id}/pause:
parameters:
- name: id
@ -1981,6 +2035,38 @@ components:
items:
$ref: "#/components/schemas/TeamMember"
SandboxMetrics:
type: object
properties:
sandbox_id:
type: string
range:
type: string
enum: ["10m", "2h", "24h"]
points:
type: array
items:
$ref: "#/components/schemas/MetricPoint"
MetricPoint:
type: object
properties:
timestamp_unix:
type: integer
format: int64
cpu_pct:
type: number
format: double
description: "CPU utilization percentage (0-100), normalized to vCPU count"
mem_bytes:
type: integer
format: int64
description: "Resident memory in bytes (VmRSS of Firecracker process)"
disk_bytes:
type: integer
format: int64
description: "Allocated disk bytes for the CoW sparse file"
Error:
type: object
properties:

View File

@ -64,6 +64,7 @@ func New(
usersH := newUsersHandler(teamSvc)
auditH := newAuditHandler(auditSvc)
statsH := newStatsHandler(statsSvc)
metricsH := newSandboxMetricsHandler(queries, pool)
// OpenAPI spec and docs.
r.Get("/openapi.yaml", serveOpenAPI)
@ -125,6 +126,7 @@ func New(
r.Post("/files/read", files.Download)
r.Post("/files/stream/write", filesStream.StreamUpload)
r.Post("/files/stream/read", filesStream.StreamDownload)
r.Get("/metrics", metricsH.GetMetrics)
})
})