forked from wrenn/wrenn
Add per-sandbox CPU/memory/disk metrics collection
Samples /proc/{fc_pid}/stat (CPU%), /proc/{fc_pid}/status (VmRSS), and
stat() on CoW files at 500ms intervals per running sandbox. Three tiered
ring buffers downsample into 30s and 5min averages for 10min/2h/24h
retention. Metrics are flushed to DB on pause (all tiers) and destroy
(24h only). New GetSandboxMetrics and FlushSandboxMetrics RPCs on the
host agent, proxied through GET /v1/sandboxes/{id}/metrics?range= on
the control plane. Returns live data for running sandboxes, DB data for
paused, and 404 for stopped.
This commit is contained in:
@ -58,6 +58,12 @@ type sandboxState struct {
|
||||
// sandbox was restored. Non-nil means re-pause should use "Diff" snapshot
|
||||
// type instead of "Full", avoiding the UFFD fault-in storm.
|
||||
parent *snapshotParent
|
||||
|
||||
// Metrics sampling state.
|
||||
fcPID int // Firecracker process PID (child of unshare wrapper)
|
||||
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
|
||||
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
|
||||
samplerDone chan struct{} // closed when the sampling goroutine exits
|
||||
}
|
||||
|
||||
// snapshotParent stores the previous generation's snapshot state so that
|
||||
@ -232,6 +238,8 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
|
||||
m.boxes[sandboxID] = sb
|
||||
m.mu.Unlock()
|
||||
|
||||
m.startSampler(sb)
|
||||
|
||||
slog.Info("sandbox created",
|
||||
"id", sandboxID,
|
||||
"template", template,
|
||||
@ -265,6 +273,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
|
||||
// cleanup tears down all resources for a sandbox.
|
||||
func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
|
||||
m.stopSampler(sb)
|
||||
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
||||
slog.Warn("vm destroy error", "id", sb.ID, "error", err)
|
||||
}
|
||||
@ -668,6 +677,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int)
|
||||
m.boxes[sandboxID] = sb
|
||||
m.mu.Unlock()
|
||||
|
||||
m.startSampler(sb)
|
||||
|
||||
// Don't delete snapshot dir — diff files are needed for re-pause.
|
||||
// The CoW file was already moved out. The dir will be cleaned up
|
||||
// on destroy or overwritten on re-pause.
|
||||
@ -987,6 +998,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
|
||||
m.boxes[sandboxID] = sb
|
||||
m.mu.Unlock()
|
||||
|
||||
m.startSampler(sb)
|
||||
|
||||
slog.Info("sandbox created from snapshot",
|
||||
"id", sandboxID,
|
||||
"snapshot", snapshotName,
|
||||
@ -1213,6 +1226,158 @@ func warnErr(msg string, id string, err error) {
|
||||
}
|
||||
}
|
||||
|
||||
// startSampler resolves the Firecracker child PID and starts a background
|
||||
// goroutine that samples CPU/mem/disk at 500ms intervals into the ring buffer.
|
||||
// Must be called after the sandbox is registered in m.boxes.
|
||||
func (m *Manager) startSampler(sb *sandboxState) {
|
||||
// Resolve the Firecracker PID (child of unshare wrapper).
|
||||
v, ok := m.vm.Get(sb.ID)
|
||||
if !ok {
|
||||
slog.Warn("metrics: VM not found, skipping sampler", "id", sb.ID)
|
||||
return
|
||||
}
|
||||
unshPID := v.PID()
|
||||
|
||||
var fcPID int
|
||||
for attempt := 0; attempt < 5; attempt++ {
|
||||
var err error
|
||||
fcPID, err = findChildPID(unshPID)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
if attempt == 4 {
|
||||
slog.Warn("metrics: could not resolve FC PID, skipping sampler", "id", sb.ID, "error", err)
|
||||
return
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
|
||||
sb.fcPID = fcPID
|
||||
sb.ring = newMetricsRing()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
sb.samplerCancel = cancel
|
||||
sb.samplerDone = make(chan struct{})
|
||||
|
||||
// Read initial CPU counters for delta calculation.
|
||||
// Passed to goroutine as local state — no shared mutation.
|
||||
initialCPU, err := readCPUStat(fcPID)
|
||||
if err != nil {
|
||||
slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err)
|
||||
}
|
||||
|
||||
go m.samplerLoop(ctx, sb, fcPID, sb.VCPUs, initialCPU)
|
||||
}
|
||||
|
||||
// samplerLoop samples /proc metrics at 500ms intervals.
// lastCPU is goroutine-local to avoid shared-state races.
//
// The loop exits when ctx is cancelled (see stopSampler) and signals its
// exit by closing sb.samplerDone.
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpus int, lastCPU cpuStat) {
	defer close(sb.samplerDone)

	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux
	lastTime := time.Now()
	// A zero lastCPU means the initial read in startSampler failed; in that
	// case the first successful read below only establishes the baseline and
	// no CPU% is computed for that interval.
	cpuInitialized := lastCPU != (cpuStat{})

	for {
		select {
		case <-ctx.Done():
			return
		case now := <-ticker.C:
			elapsed := now.Sub(lastTime).Seconds()
			lastTime = now

			// CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100,
			// i.e. utilization normalized across the sandbox's vCPUs.
			var cpuPct float64
			cur, err := readCPUStat(fcPID)
			if err == nil {
				if cpuInitialized && elapsed > 0 && vcpus > 0 {
					deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime))
					cpuPct = (deltaJiffies / (elapsed * clkTck * float64(vcpus))) * 100.0
					// Clamp to [0, 100]; timing jitter between the tick and
					// the /proc read can push the estimate past either bound.
					if cpuPct > 100.0 {
						cpuPct = 100.0
					}
					if cpuPct < 0 {
						cpuPct = 0
					}
				}
				lastCPU = cur
				cpuInitialized = true
			}

			// Memory: VmRSS of the Firecracker process.
			// Best-effort: a failed read records 0 for this sample.
			memBytes, _ := readMemRSS(fcPID)

			// Disk: allocated bytes of the CoW sparse file.
			// NOTE(review): sb.dmDevice is read here without holding the
			// manager lock — assumes it is not mutated while the sampler
			// runs; confirm against Pause/Resume paths.
			var diskBytes int64
			if sb.dmDevice != nil {
				diskBytes, _ = readDiskAllocated(sb.dmDevice.CowPath)
			}

			sb.ring.Push(MetricPoint{
				Timestamp: now,
				CPUPct:    cpuPct,
				MemBytes:  memBytes,
				DiskBytes: diskBytes,
			})
		}
	}
}
|
||||
|
||||
// stopSampler stops the metrics sampling goroutine and waits for it to exit.
|
||||
func (m *Manager) stopSampler(sb *sandboxState) {
|
||||
if sb.samplerCancel != nil {
|
||||
sb.samplerCancel()
|
||||
<-sb.samplerDone
|
||||
sb.samplerCancel = nil
|
||||
}
|
||||
}
|
||||
|
||||
// GetMetrics returns the ring buffer data for the given range tier.
|
||||
// Valid ranges: "10m", "2h", "24h".
|
||||
func (m *Manager) GetMetrics(sandboxID, rangeTier string) ([]MetricPoint, error) {
|
||||
m.mu.RLock()
|
||||
sb, ok := m.boxes[sandboxID]
|
||||
m.mu.RUnlock()
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("sandbox not found: %s", sandboxID)
|
||||
}
|
||||
if sb.ring == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
switch rangeTier {
|
||||
case "10m":
|
||||
return sb.ring.Get10m(), nil
|
||||
case "2h":
|
||||
return sb.ring.Get2h(), nil
|
||||
case "24h":
|
||||
return sb.ring.Get24h(), nil
|
||||
default:
|
||||
return nil, fmt.Errorf("invalid range: %s (valid: 10m, 2h, 24h)", rangeTier)
|
||||
}
|
||||
}
|
||||
|
||||
// FlushMetrics returns all three tier ring buffers, clears the ring, and
|
||||
// stops the sampler goroutine. Called by the control plane before pause/destroy.
|
||||
func (m *Manager) FlushMetrics(sandboxID string) (pts10m, pts2h, pts24h []MetricPoint, err error) {
|
||||
m.mu.RLock()
|
||||
sb, ok := m.boxes[sandboxID]
|
||||
m.mu.RUnlock()
|
||||
if !ok {
|
||||
return nil, nil, nil, fmt.Errorf("sandbox not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
m.stopSampler(sb)
|
||||
if sb.ring == nil {
|
||||
return nil, nil, nil, nil
|
||||
}
|
||||
pts10m, pts2h, pts24h = sb.ring.Flush()
|
||||
return pts10m, pts2h, pts24h, nil
|
||||
}
|
||||
|
||||
// copyFile copies a regular file from src to dst using streaming I/O.
|
||||
func copyFile(src, dst string) error {
|
||||
sf, err := os.Open(src)
|
||||
|
||||
178
internal/sandbox/metrics.go
Normal file
178
internal/sandbox/metrics.go
Normal file
@ -0,0 +1,178 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MetricPoint holds one metrics sample.
type MetricPoint struct {
	Timestamp time.Time // sample time; for downsampled tiers, the time of the last source sample
	CPUPct    float64   // CPU utilization percent, normalized by vCPU count and clamped to [0, 100]
	MemBytes  int64     // resident set size (VmRSS) of the Firecracker process, in bytes
	DiskBytes int64     // allocated (not apparent) size of the CoW disk file, in bytes
}
|
||||
|
||||
// Ring buffer capacity constants.
//
// Each tier's capacity × its sample interval equals the tier's retention
// window; the downsample ratios define how many finer-tier samples are
// averaged into one coarser-tier sample.
const (
	ring10mCap = 1200 // 500ms × 1200 = 10 min
	ring2hCap  = 240  // 30s × 240 = 2 h
	ring24hCap = 288  // 5min × 288 = 24 h

	downsample2hEvery  = 60 // 60 × 500ms = 30s
	downsample24hEvery = 10 // 10 × 30s = 5min
)
|
||||
|
||||
// metricsRing holds three tiered ring buffers with automatic downsampling
// from the finest tier into coarser tiers.
//
// All fields are guarded by mu; Push and the Get*/Flush accessors may be
// called from different goroutines (sampler loop vs. RPC handlers).
type metricsRing struct {
	mu sync.Mutex

	// 10-minute tier: 500ms samples.
	buf10m   [ring10mCap]MetricPoint
	idx10m   int // next write position (wraps at ring10mCap)
	count10m int // number of valid entries; saturates at ring10mCap
	
	// 2-hour tier: 30s averages.
	buf2h   [ring2hCap]MetricPoint
	idx2h   int
	count2h int

	// 24-hour tier: 5min averages.
	buf24h   [ring24hCap]MetricPoint
	idx24h   int
	count24h int

	// Accumulators for downsampling.
	acc500ms  [downsample2hEvery]MetricPoint // pending 500ms samples, averaged into buf2h when full
	acc500msN int
	
	acc30s  [downsample24hEvery]MetricPoint // pending 30s averages, averaged into buf24h when full
	acc30sN int
}
|
||||
|
||||
// newMetricsRing creates an empty metrics ring buffer.
|
||||
func newMetricsRing() *metricsRing {
|
||||
return &metricsRing{}
|
||||
}
|
||||
|
||||
// Push adds a 500ms sample to the finest tier and triggers downsampling
|
||||
// into coarser tiers when enough samples have accumulated.
|
||||
func (r *metricsRing) Push(p MetricPoint) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
// Write to 10m ring.
|
||||
r.buf10m[r.idx10m] = p
|
||||
r.idx10m = (r.idx10m + 1) % ring10mCap
|
||||
if r.count10m < ring10mCap {
|
||||
r.count10m++
|
||||
}
|
||||
|
||||
// Accumulate for 2h downsample.
|
||||
r.acc500ms[r.acc500msN] = p
|
||||
r.acc500msN++
|
||||
if r.acc500msN == downsample2hEvery {
|
||||
avg := averagePoints(r.acc500ms[:downsample2hEvery])
|
||||
r.push2h(avg)
|
||||
r.acc500msN = 0
|
||||
}
|
||||
}
|
||||
|
||||
func (r *metricsRing) push2h(p MetricPoint) {
|
||||
r.buf2h[r.idx2h] = p
|
||||
r.idx2h = (r.idx2h + 1) % ring2hCap
|
||||
if r.count2h < ring2hCap {
|
||||
r.count2h++
|
||||
}
|
||||
|
||||
// Accumulate for 24h downsample.
|
||||
r.acc30s[r.acc30sN] = p
|
||||
r.acc30sN++
|
||||
if r.acc30sN == downsample24hEvery {
|
||||
avg := averagePoints(r.acc30s[:downsample24hEvery])
|
||||
r.push24h(avg)
|
||||
r.acc30sN = 0
|
||||
}
|
||||
}
|
||||
|
||||
func (r *metricsRing) push24h(p MetricPoint) {
|
||||
r.buf24h[r.idx24h] = p
|
||||
r.idx24h = (r.idx24h + 1) % ring24hCap
|
||||
if r.count24h < ring24hCap {
|
||||
r.count24h++
|
||||
}
|
||||
}
|
||||
|
||||
// Get10m returns the 10-minute tier points in chronological order.
|
||||
func (r *metricsRing) Get10m() []MetricPoint {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
return r.readRing(r.buf10m[:], r.idx10m, r.count10m)
|
||||
}
|
||||
|
||||
// Get2h returns the 2-hour tier points in chronological order.
|
||||
func (r *metricsRing) Get2h() []MetricPoint {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
return r.readRing(r.buf2h[:], r.idx2h, r.count2h)
|
||||
}
|
||||
|
||||
// Get24h returns the 24-hour tier points in chronological order.
|
||||
func (r *metricsRing) Get24h() []MetricPoint {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
return r.readRing(r.buf24h[:], r.idx24h, r.count24h)
|
||||
}
|
||||
|
||||
// Flush returns all three tiers and resets the ring buffer.
|
||||
func (r *metricsRing) Flush() (pts10m, pts2h, pts24h []MetricPoint) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
|
||||
pts10m = r.readRing(r.buf10m[:], r.idx10m, r.count10m)
|
||||
pts2h = r.readRing(r.buf2h[:], r.idx2h, r.count2h)
|
||||
pts24h = r.readRing(r.buf24h[:], r.idx24h, r.count24h)
|
||||
|
||||
// Reset all state.
|
||||
r.idx10m, r.count10m = 0, 0
|
||||
r.idx2h, r.count2h = 0, 0
|
||||
r.idx24h, r.count24h = 0, 0
|
||||
r.acc500msN = 0
|
||||
r.acc30sN = 0
|
||||
|
||||
return pts10m, pts2h, pts24h
|
||||
}
|
||||
|
||||
// readRing extracts elements from a circular buffer in chronological order.
|
||||
func (r *metricsRing) readRing(buf []MetricPoint, nextIdx, count int) []MetricPoint {
|
||||
if count == 0 {
|
||||
return nil
|
||||
}
|
||||
result := make([]MetricPoint, count)
|
||||
bufLen := len(buf)
|
||||
start := (nextIdx - count + bufLen) % bufLen
|
||||
for i := range count {
|
||||
result[i] = buf[(start+i)%bufLen]
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// averagePoints computes the average of a slice of MetricPoints.
|
||||
// The timestamp is set to the last point's timestamp.
|
||||
func averagePoints(pts []MetricPoint) MetricPoint {
|
||||
n := float64(len(pts))
|
||||
var cpu float64
|
||||
var mem, disk int64
|
||||
for _, p := range pts {
|
||||
cpu += p.CPUPct
|
||||
mem += p.MemBytes
|
||||
disk += p.DiskBytes
|
||||
}
|
||||
return MetricPoint{
|
||||
Timestamp: pts[len(pts)-1].Timestamp,
|
||||
CPUPct: cpu / n,
|
||||
MemBytes: int64(float64(mem) / n),
|
||||
DiskBytes: int64(float64(disk) / n),
|
||||
}
|
||||
}
|
||||
105
internal/sandbox/proc.go
Normal file
105
internal/sandbox/proc.go
Normal file
@ -0,0 +1,105 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// findChildPID reads the direct child PID of a given parent process.
// The Firecracker process is a direct child of the unshare wrapper because
// the init script uses `exec ip netns exec ... firecracker`, which replaces
// bash with ip-netns-exec, which in turn execs firecracker — same PID,
// direct child of unshare.
func findChildPID(parentPID int) (int, error) {
	childrenPath := fmt.Sprintf("/proc/%d/task/%d/children", parentPID, parentPID)
	raw, err := os.ReadFile(childrenPath)
	if err != nil {
		return 0, fmt.Errorf("read children: %w", err)
	}

	children := strings.Fields(string(raw))
	if len(children) == 0 {
		return 0, fmt.Errorf("no child processes found for PID %d", parentPID)
	}

	// The first listed child is the one we want; the wrapper spawns only
	// the Firecracker process.
	first := children[0]
	pid, err := strconv.Atoi(first)
	if err != nil {
		return 0, fmt.Errorf("parse child PID %q: %w", first, err)
	}
	return pid, nil
}
|
||||
|
||||
// cpuStat holds raw CPU jiffies read from /proc/{pid}/stat.
type cpuStat struct {
	utime uint64 // user-mode jiffies (field 14 of /proc/{pid}/stat)
	stime uint64 // kernel-mode jiffies (field 15 of /proc/{pid}/stat)
}

// readCPUStat reads user and system CPU jiffies from /proc/{pid}/stat.
func readCPUStat(pid int) (cpuStat, error) {
	path := fmt.Sprintf("/proc/%d/stat", pid)
	data, err := os.ReadFile(path)
	if err != nil {
		return cpuStat{}, fmt.Errorf("read stat: %w", err)
	}
	return parseCPUStat(pid, string(data))
}

// parseCPUStat extracts utime/stime from /proc/{pid}/stat contents.
//
// Format: "pid (comm) state field4 ...". The comm field may itself contain
// spaces and parens, so parsing starts after the LAST ')'. Relative to the
// fields after comm, state is index 0, utime index 11 (field 14 in the man
// page's 1-based numbering) and stime index 12 (field 15).
// pid is used only for error messages.
func parseCPUStat(pid int, content string) (cpuStat, error) {
	idx := strings.LastIndex(content, ")")
	if idx < 0 {
		return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: no closing paren", pid)
	}
	// Slice from idx+1, not idx+2: strings.Fields discards the leading
	// space anyway, and idx+2 would panic (slice out of range) if ')'
	// happened to be the final byte of a truncated/malformed file.
	fields := strings.Fields(content[idx+1:])
	if len(fields) < 13 {
		return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: too few fields (%d)", pid, len(fields))
	}
	utime, err := strconv.ParseUint(fields[11], 10, 64)
	if err != nil {
		return cpuStat{}, fmt.Errorf("parse utime: %w", err)
	}
	stime, err := strconv.ParseUint(fields[12], 10, 64)
	if err != nil {
		return cpuStat{}, fmt.Errorf("parse stime: %w", err)
	}
	return cpuStat{utime: utime, stime: stime}, nil
}
|
||||
|
||||
// readMemRSS reads VmRSS from /proc/{pid}/status and returns bytes.
func readMemRSS(pid int) (int64, error) {
	path := fmt.Sprintf("/proc/%d/status", pid)
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, fmt.Errorf("read status: %w", err)
	}
	return parseVmRSS(pid, string(data))
}

// parseVmRSS extracts the VmRSS value from /proc/{pid}/status contents and
// converts it to bytes. The kernel reports it in kB, e.g. "VmRSS:  1024 kB".
// pid is used only for error messages.
func parseVmRSS(pid int, data string) (int64, error) {
	for _, line := range strings.Split(data, "\n") {
		if !strings.HasPrefix(line, "VmRSS:") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 2 {
			return 0, fmt.Errorf("malformed VmRSS line")
		}
		kb, err := strconv.ParseInt(fields[1], 10, 64)
		if err != nil {
			return 0, fmt.Errorf("parse VmRSS: %w", err)
		}
		return kb * 1024, nil
	}
	return 0, fmt.Errorf("VmRSS not found in /proc/%d/status", pid)
}
|
||||
|
||||
// readDiskAllocated returns the actual allocated bytes (not apparent size)
// of the file at path, computed from stat's 512-byte block count.
func readDiskAllocated(path string) (int64, error) {
	var st syscall.Stat_t
	err := syscall.Stat(path, &st)
	if err != nil {
		return 0, fmt.Errorf("stat %s: %w", path, err)
	}
	return st.Blocks * 512, nil
}
|
||||
Reference in New Issue
Block a user