forked from wrenn/wrenn
Add per-sandbox CPU/memory/disk metrics collection
Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers downsample into 30s and 5min averages for 10min/2h/24h
retention. Metrics are flushed to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
130
internal/api/handlers_metrics.go
Normal file
130
internal/api/handlers_metrics.go
Normal file
@ -0,0 +1,130 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
|
||||
"connectrpc.com/connect"
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"git.omukk.dev/wrenn/sandbox/internal/auth"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/db"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/lifecycle"
|
||||
pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
|
||||
)
|
||||
|
||||
type sandboxMetricsHandler struct {
|
||||
db *db.Queries
|
||||
pool *lifecycle.HostClientPool
|
||||
}
|
||||
|
||||
func newSandboxMetricsHandler(db *db.Queries, pool *lifecycle.HostClientPool) *sandboxMetricsHandler {
|
||||
return &sandboxMetricsHandler{db: db, pool: pool}
|
||||
}
|
||||
|
||||
// metricPointResponse is the JSON shape of a single metrics sample returned
// by the metrics endpoint. Field order is part of the encoded output and must
// not be rearranged.
type metricPointResponse struct {
	// TimestampUnix is the sample time as reported by the data source.
	TimestampUnix int64 `json:"timestamp_unix"`
	// CPUPct is the CPU usage value for the sample.
	CPUPct float64 `json:"cpu_pct"`
	// MemBytes is the memory usage value for the sample, in bytes.
	MemBytes int64 `json:"mem_bytes"`
	// DiskBytes is the disk usage value for the sample, in bytes.
	DiskBytes int64 `json:"disk_bytes"`
}
|
||||
|
||||
type metricsResponse struct {
|
||||
SandboxID string `json:"sandbox_id"`
|
||||
Range string `json:"range"`
|
||||
Points []metricPointResponse `json:"points"`
|
||||
}
|
||||
|
||||
// GetMetrics handles GET /v1/sandboxes/{id}/metrics?range=10m|2h|24h.
|
||||
func (h *sandboxMetricsHandler) GetMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
sandboxID := chi.URLParam(r, "id")
|
||||
ctx := r.Context()
|
||||
ac := auth.MustFromContext(ctx)
|
||||
|
||||
rangeTier := r.URL.Query().Get("range")
|
||||
if rangeTier == "" {
|
||||
rangeTier = "10m"
|
||||
}
|
||||
if rangeTier != "10m" && rangeTier != "2h" && rangeTier != "24h" {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "range must be 10m, 2h, or 24h")
|
||||
return
|
||||
}
|
||||
|
||||
sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
|
||||
return
|
||||
}
|
||||
|
||||
switch sb.Status {
|
||||
case "running":
|
||||
h.getFromAgent(w, r, sandboxID, rangeTier, sb.HostID)
|
||||
case "paused":
|
||||
h.getFromDB(ctx, w, sandboxID, rangeTier)
|
||||
default:
|
||||
writeError(w, http.StatusNotFound, "not_found", "metrics not available for sandbox in state: "+sb.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *sandboxMetricsHandler) getFromAgent(w http.ResponseWriter, r *http.Request, sandboxID, rangeTier, hostID string) {
|
||||
ctx := r.Context()
|
||||
|
||||
agent, err := agentForHost(ctx, h.db, h.pool, hostID)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "host_unavailable", "sandbox host is not reachable")
|
||||
return
|
||||
}
|
||||
|
||||
resp, err := agent.GetSandboxMetrics(ctx, connect.NewRequest(&pb.GetSandboxMetricsRequest{
|
||||
SandboxId: sandboxID,
|
||||
Range: rangeTier,
|
||||
}))
|
||||
if err != nil {
|
||||
status, code, msg := agentErrToHTTP(err)
|
||||
writeError(w, status, code, msg)
|
||||
return
|
||||
}
|
||||
|
||||
points := make([]metricPointResponse, len(resp.Msg.Points))
|
||||
for i, p := range resp.Msg.Points {
|
||||
points[i] = metricPointResponse{
|
||||
TimestampUnix: p.TimestampUnix,
|
||||
CPUPct: p.CpuPct,
|
||||
MemBytes: p.MemBytes,
|
||||
DiskBytes: p.DiskBytes,
|
||||
}
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, metricsResponse{
|
||||
SandboxID: sandboxID,
|
||||
Range: rangeTier,
|
||||
Points: points,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *sandboxMetricsHandler) getFromDB(ctx context.Context, w http.ResponseWriter, sandboxID, rangeTier string) {
|
||||
rows, err := h.db.GetSandboxMetricPoints(ctx, db.GetSandboxMetricPointsParams{
|
||||
SandboxID: sandboxID,
|
||||
Tier: rangeTier,
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "internal_error", "failed to read metrics")
|
||||
return
|
||||
}
|
||||
|
||||
points := make([]metricPointResponse, len(rows))
|
||||
for i, row := range rows {
|
||||
points[i] = metricPointResponse{
|
||||
TimestampUnix: row.Ts,
|
||||
CPUPct: row.CpuPct,
|
||||
MemBytes: row.MemBytes,
|
||||
DiskBytes: row.DiskBytes,
|
||||
}
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, metricsResponse{
|
||||
SandboxID: sandboxID,
|
||||
Range: rangeTier,
|
||||
Points: points,
|
||||
})
|
||||
}
|
||||
@ -751,6 +751,60 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/sandboxes/{id}/metrics:
|
||||
parameters:
|
||||
- name: id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
get:
|
||||
summary: Get per-sandbox resource metrics
|
||||
operationId: getSandboxMetrics
|
||||
tags: [sandboxes]
|
||||
security:
|
||||
- apiKeyAuth: []
|
||||
- bearerAuth: []
|
||||
description: |
|
||||
Returns time-series CPU, memory, and disk metrics for a sandbox.
|
||||
Three tiers are available with different granularity and retention:
|
||||
- `10m`: 500ms samples, last 10 minutes
|
||||
- `2h`: 30-second averages, last 2 hours
|
||||
- `24h`: 5-minute averages, last 24 hours
|
||||
|
||||
For running sandboxes, data comes from the host agent's in-memory
|
||||
ring buffer. For paused sandboxes, data is read from persisted
|
||||
snapshots in the database. Stopped/destroyed sandboxes return 404.
|
||||
parameters:
|
||||
- name: range
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
enum: ["10m", "2h", "24h"]
|
||||
default: "10m"
|
||||
description: Time range tier to query
|
||||
responses:
|
||||
"200":
|
||||
description: Metrics retrieved
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SandboxMetrics"
|
||||
"400":
|
||||
description: Invalid range parameter
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"404":
|
||||
description: Sandbox not found or metrics not available
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/sandboxes/{id}/pause:
|
||||
parameters:
|
||||
- name: id
|
||||
@ -1981,6 +2035,38 @@ components:
|
||||
items:
|
||||
$ref: "#/components/schemas/TeamMember"
|
||||
|
||||
SandboxMetrics:
|
||||
type: object
|
||||
properties:
|
||||
sandbox_id:
|
||||
type: string
|
||||
range:
|
||||
type: string
|
||||
enum: ["10m", "2h", "24h"]
|
||||
points:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/MetricPoint"
|
||||
|
||||
MetricPoint:
|
||||
type: object
|
||||
properties:
|
||||
timestamp_unix:
|
||||
type: integer
|
||||
format: int64
|
||||
cpu_pct:
|
||||
type: number
|
||||
format: double
|
||||
description: "CPU utilization percentage (0-100), normalized to vCPU count"
|
||||
mem_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
description: "Resident memory in bytes (VmRSS of Firecracker process)"
|
||||
disk_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
description: "Allocated disk bytes for the CoW sparse file"
|
||||
|
||||
Error:
|
||||
type: object
|
||||
properties:
|
||||
|
||||
@ -64,6 +64,7 @@ func New(
|
||||
usersH := newUsersHandler(teamSvc)
|
||||
auditH := newAuditHandler(auditSvc)
|
||||
statsH := newStatsHandler(statsSvc)
|
||||
metricsH := newSandboxMetricsHandler(queries, pool)
|
||||
|
||||
// OpenAPI spec and docs.
|
||||
r.Get("/openapi.yaml", serveOpenAPI)
|
||||
@ -125,6 +126,7 @@ func New(
|
||||
r.Post("/files/read", files.Download)
|
||||
r.Post("/files/stream/write", filesStream.StreamUpload)
|
||||
r.Post("/files/stream/read", filesStream.StreamDownload)
|
||||
r.Get("/metrics", metricsH.GetMetrics)
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user