1
0
forked from wrenn/wrenn

Add per-sandbox CPU/memory/disk metrics collection

Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers downsample into 30s and 5min averages for 10min/2h/24h
retention. Metrics are flushed to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
2026-03-25 20:10:33 +06:00
parent 7473c15f52
commit 9acdbb5ae9
16 changed files with 1430 additions and 90 deletions

View File

@ -58,6 +58,12 @@ type sandboxState struct {
// sandbox was restored. Non-nil means re-pause should use "Diff" snapshot
// type instead of "Full", avoiding the UFFD fault-in storm.
parent *snapshotParent
// Metrics sampling state.
fcPID int // Firecracker process PID (child of unshare wrapper)
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
samplerDone chan struct{} // closed when the sampling goroutine exits
}
// snapshotParent stores the previous generation's snapshot state so that
@ -232,6 +238,8 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
m.boxes[sandboxID] = sb
m.mu.Unlock()
m.startSampler(sb)
slog.Info("sandbox created",
"id", sandboxID,
"template", template,
@ -265,6 +273,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
// cleanup tears down all resources for a sandbox.
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
m.stopSampler(sb)
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
slog.Warn("vm destroy error", "id", sb.ID, "error", err)
}
@ -668,6 +677,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int)
m.boxes[sandboxID] = sb
m.mu.Unlock()
m.startSampler(sb)
// Don't delete snapshot dir — diff files are needed for re-pause.
// The CoW file was already moved out. The dir will be cleaned up
// on destroy or overwritten on re-pause.
@ -987,6 +998,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
m.boxes[sandboxID] = sb
m.mu.Unlock()
m.startSampler(sb)
slog.Info("sandbox created from snapshot",
"id", sandboxID,
"snapshot", snapshotName,
@ -1213,6 +1226,158 @@ func warnErr(msg string, id string, err error) {
}
}
// startSampler looks up the Firecracker process behind sb and launches the
// background goroutine that records CPU/memory/disk samples into sb.ring
// every 500ms.
//
// The VM handle reports the PID of the unshare wrapper; the Firecracker PID is
// that wrapper's child. Resolving the child is retried because it may not be
// visible immediately. If the VM is unknown or the child PID cannot be
// resolved, a warning is logged and no sampler is started (sb.ring stays nil).
//
// Must be called after the sandbox is registered in m.boxes.
//
// NOTE(review): the sampler fields on sb are written here without holding
// m.mu even though sb is already reachable through m.boxes — confirm that
// concurrent readers cannot observe them mid-initialisation.
func (m *Manager) startSampler(sb *sandboxState) {
	const (
		maxResolveAttempts = 5
		resolveRetryDelay  = 50 * time.Millisecond
	)

	vmHandle, found := m.vm.Get(sb.ID)
	if !found {
		slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID)
		return
	}

	// Resolve the Firecracker PID (child of unshare wrapper), sleeping between
	// failed attempts but not after the final one.
	wrapperPID := vmHandle.PID()
	fcPID, resolveErr := findChildPID(wrapperPID)
	for tries := 1; resolveErr != nil; tries++ {
		if tries == maxResolveAttempts {
			slog.Warn("metrics: could not resolve FC PID, skipping sampler", "id", sb.ID, "error", resolveErr)
			return
		}
		time.Sleep(resolveRetryDelay)
		fcPID, resolveErr = findChildPID(wrapperPID)
	}

	samplerCtx, stop := context.WithCancel(context.Background())
	sb.fcPID = fcPID
	sb.ring = newMetricsRing()
	sb.samplerCancel = stop
	sb.samplerDone = make(chan struct{})

	// Baseline CPU counters for the first delta. They are handed to the
	// goroutine by value, so nothing is shared. A failed read is only logged;
	// the loop then establishes its own baseline on its first successful read.
	baseline, statErr := readCPUStat(fcPID)
	if statErr != nil {
		slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", statErr)
	}

	go m.samplerLoop(samplerCtx, sb, fcPID, sb.VCPUs, baseline)
}
// samplerLoop samples /proc metrics for one sandbox at 500ms intervals and
// pushes each sample into sb.ring until ctx is cancelled. sb.samplerDone is
// closed when the loop exits so that stopSampler can wait for it.
//
// fcPID is the process whose CPU and memory usage are read; vcpus normalises
// the CPU percentage. lastCPU is the baseline counter reading supplied by the
// caller; a zero value means no baseline is available yet and the first
// successful read becomes the baseline. lastCPU and its timestamp are
// goroutine-local to avoid shared-state races.
//
// Failed reads are best-effort: the affected field is recorded as zero for
// that sample.
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpus int, lastCPU cpuStat) {
	defer close(sb.samplerDone)

	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	const clkTck = 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux

	// lastCPUTime is the instant associated with lastCPU. It advances only
	// together with lastCPU: if a read fails, the next successful delta spans
	// several ticks of jiffies and must be divided by the matching span of
	// wall-clock time, otherwise the CPU percentage would be over-reported.
	lastCPUTime := time.Now()
	cpuInitialized := lastCPU != (cpuStat{})

	for {
		select {
		case <-ctx.Done():
			return
		case now := <-ticker.C:
			// CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100
			var cpuPct float64
			cur, err := readCPUStat(fcPID)
			if err == nil {
				elapsed := now.Sub(lastCPUTime).Seconds()
				if cpuInitialized && elapsed > 0 && vcpus > 0 {
					deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime))
					cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0
					// Clamp to the 0-100 range reported to callers.
					if cpuPct > 100.0 {
						cpuPct = 100.0
					}
					if cpuPct < 0 {
						cpuPct = 0
					}
				}
				lastCPU = cur
				lastCPUTime = now
				cpuInitialized = true
			}

			// Memory: VmRSS of the Firecracker process.
			// Best-effort: on error the sample records zero.
			memBytes, _ := readMemRSS(fcPID)

			// Disk: allocated bytes of the CoW sparse file.
			// Best-effort: on error, or without a dm device, the sample
			// records zero.
			var diskBytes int64
			if sb.dmDevice != nil {
				diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath)
			}

			sb.ring.Push(MetricPoint{
				Timestamp: now,
				CPUPct:    cpuPct,
				MemBytes:  memBytes,
				DiskBytes: diskBytes,
			})
		}
	}
}
// stopSampler cancels the metrics sampling goroutine of sb and blocks until
// that goroutine has closed sb.samplerDone. It is a no-op when no sampler is
// recorded on sb (samplerCancel is nil), which also makes a second sequential
// call harmless.
func (m *Manager) stopSampler(sb *sandboxState) {
	cancel := sb.samplerCancel
	if cancel == nil {
		return
	}

	cancel()
	<-sb.samplerDone

	// Mark the sampler as stopped so later calls return immediately.
	sb.samplerCancel = nil
}
// GetMetrics returns the buffered metric points of one retention tier for the
// sandbox identified by sandboxID.
//
// rangeTier must be "10m", "2h" or "24h"; any other value yields an error, as
// does an unknown sandboxID. A sandbox that has no ring buffer yields nil
// points and a nil error; this check happens before rangeTier is validated.
func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) {
	m.mu.RLock()
	box, found := m.boxes[sandboxID]
	m.mu.RUnlock()

	if !found {
		return nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}

	ring := box.ring
	if ring == nil {
		return nil, nil
	}

	var points []MetricPoint
	switch rangeTier {
	case "10m":
		points = ring.Get10m()
	case "2h":
		points = ring.Get2h()
	case "24h":
		points = ring.Get24h()
	default:
		return nil, fmt.Errorf("invalid range: %s (valid: 10m, 2h, 24h)", rangeTier)
	}
	return points, nil
}
// FlushMetrics stops the sampler goroutine of the given sandbox and then
// drains its ring, returning the points of the 10m, 2h and 24h tiers.
// Called by the control plane before pause/destroy.
//
// An unknown sandboxID yields an error. A sandbox without a ring buffer
// yields three nil slices and a nil error.
func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) {
	m.mu.RLock()
	box, found := m.boxes[sandboxID]
	m.mu.RUnlock()

	if !found {
		return nil, nil, nil, fmt.Errorf("sandbox not found: %s", sandboxID)
	}

	// Stop sampling first so no new points arrive while the ring is drained.
	m.stopSampler(box)

	if ring := box.ring; ring != nil {
		pts10m, pts2h, pts24h = ring.Flush()
	}
	return pts10m, pts2h, pts24h, nil
}
// copyFile copies a regular file from src to dst using streaming I/O.
func copyFile(src, dst string) error {
sf, err := os.Open(src)