fix: accurate sandbox metrics and memory management

Three issues fixed:

1. Memory metrics previously read the host-side VmRSS of the Firecracker process, which includes guest page cache and never decreases. Replaced readMemRSS(fcPID) with readEnvdMemUsed(client), which queries envd's /metrics endpoint for guest-side used memory (total - MemAvailable). This matches what neofetch reports and reflects actual process memory.
2. Added a Firecracker balloon device (deflate_on_oom, 5s stats interval) and an envd-side periodic page cache reclaimer (drop_caches when more than 80% of memory is used). The reclaimer is gated by a snapshot_in_progress flag, with sync() before freeze, to prevent memory corruption during pause.
3. Changed the sampling interval from 500ms to 1s and adjusted the ring buffer capacities to maintain the same time windows. This reduces per-host HTTP load from 240 calls/sec to 120 calls/sec at 120 capsules.

Also: maxDiffGenerations 8 → 1 (merge on every re-pause, since UFFD lazy-loads anyway), and the envd mem_used formula now uses total - available.
2026-05-03 12:19:01 +06:00
parent 233e747d5d
commit 1178ab8b21
11 changed files with 157 additions and 45 deletions
--- a/internal/sandbox/metrics.go
+++ b/internal/sandbox/metrics.go
@@ -15,11 +15,11 @@ type MetricPoint struct {

 // Ring buffer capacity constants.
 const (
-	ring10mCap = 1200 // 500ms × 1200 = 10 min
-	ring2hCap  = 240  // 30s × 240 = 2 h
-	ring24hCap = 288  // 5min × 288 = 24 h
+	ring10mCap = 600 // 1s × 600 = 10 min
+	ring2hCap  = 240 // 30s × 240 = 2 h
+	ring24hCap = 288 // 5min × 288 = 24 h

-	downsample2hEvery  = 60 // 60 × 500ms = 30s
+	downsample2hEvery  = 30 // 30 × 1s = 30s
 	downsample24hEvery = 10 // 10 × 30s = 5min
 )

@@ -44,8 +44,8 @@ type metricsRing struct {
 	count24h int

 	// Accumulators for downsampling.
-	acc500ms  [downsample2hEvery]MetricPoint
-	acc500msN int
+	acc1s  [downsample2hEvery]MetricPoint
+	acc1sN int

 	acc30s  [downsample24hEvery]MetricPoint
 	acc30sN int
@@ -56,7 +56,7 @@ func newMetricsRing() *metricsRing {
 	return &metricsRing{}
 }

-// Push adds a 500ms sample to the finest tier and triggers downsampling
+// Push adds a 1s sample to the finest tier and triggers downsampling
 // into coarser tiers when enough samples have accumulated.
 func (r *metricsRing) Push(p MetricPoint) {
 	r.mu.Lock()
@@ -70,12 +70,12 @@ func (r *metricsRing) Push(p MetricPoint) {
 	}

 	// Accumulate for 2h downsample.
-	r.acc500ms[r.acc500msN] = p
-	r.acc500msN++
-	if r.acc500msN == downsample2hEvery {
-		avg := averagePoints(r.acc500ms[:downsample2hEvery])
+	r.acc1s[r.acc1sN] = p
+	r.acc1sN++
+	if r.acc1sN == downsample2hEvery {
+		avg := averagePoints(r.acc1s[:downsample2hEvery])
 		r.push2h(avg)
-		r.acc500msN = 0
+		r.acc1sN = 0
 	}
 }

@@ -138,7 +138,7 @@ func (r *metricsRing) Flush() (pts10m, pts2h, pts24h []MetricPoint) {
 	r.idx10m, r.count10m = 0, 0
 	r.idx2h, r.count2h = 0, 0
 	r.idx24h, r.count24h = 0, 0
-	r.acc500msN = 0
+	r.acc1sN = 0
 	r.acc30sN = 0

 	return pts10m, pts2h, pts24h