v0.1.6 (#45)
## What's New?

Performance updates for large capsules, admin panel enhancements, and bug fixes.

### Envd
- Fixed a bug in sandbox metrics calculation
- Page cache drop and balloon inflation to reduce memfile snapshot size
- Updated RPC timeout logic for better control
- Added tests

### Admin Panel
- Add/remove platform admins
- Updated template deletion logic for fine-grained permissions

### Others
- Minor frontend visual improvements
- Minor bugfixes
- Version bump

Co-authored-by: Tasnim Kabir Sadik <tksadik92@gmail.com>
Reviewed-on: wrenn/wrenn#45
Co-authored-by: pptx704 <rafeed@omukk.dev>
Co-committed-by: pptx704 <rafeed@omukk.dev>
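The memfile reduction mentioned above works by inflating the Firecracker balloon just before snapshotting, so only a small multiple of the guest's in-use memory stays resident. Below is a minimal, standalone sketch of that sizing heuristic; the function name and the example numbers are illustrative and not part of the change itself.

```go
package main

import "fmt"

// inflateTarget mirrors the heuristic used in Manager.Pause: keep twice the
// guest's used memory (at least 256 MiB) plus 128 MiB of slack, and hand the
// rest to the balloon so it is excluded from the memfile.
func inflateTarget(totalMiB, usedMiB int) int {
	keepMiB := max(usedMiB*2, 256) + 128
	return totalMiB - keepMiB // <= 0 means "not enough free memory, skip inflate"
}

func main() {
	// A 20480 MiB sandbox with ~1500 MiB in use keeps 3128 MiB resident
	// and reclaims 17352 MiB, shrinking the memfile accordingly.
	fmt.Println(inflateTarget(20480, 1500)) // 17352
}
```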
@@ -1,6 +1,7 @@
package sandbox

import (
    "context"
    "sync"
    "sync/atomic"
    "time"
@@ -17,6 +18,20 @@ type ConnTracker struct {
    // goroutine to exit, preventing goroutine leaks on repeated pause failures.
    cancelMu    sync.Mutex
    cancelDrain chan struct{}

    // ctx is cancelled by ForceClose to abort all in-flight proxy requests.
    // Initialized lazily on first Acquire; replaced by Reset after a failed
    // pause so new connections get a fresh, non-cancelled context.
    ctxMu  sync.Mutex
    ctx    context.Context
    cancel context.CancelFunc
}

// ensureCtx lazily initializes the cancellable context.
func (t *ConnTracker) ensureCtx() {
    if t.ctx == nil {
        t.ctx, t.cancel = context.WithCancel(context.Background())
    }
}

// Acquire registers one in-flight connection. Returns false if the tracker
@@ -35,6 +50,16 @@ func (t *ConnTracker) Acquire() bool {
    return true
}

// Context returns a context that is cancelled when ForceClose is called.
// Proxy handlers should derive their request context from this so that
// force-close during pause aborts in-flight proxied requests.
func (t *ConnTracker) Context() context.Context {
    t.ctxMu.Lock()
    defer t.ctxMu.Unlock()
    t.ensureCtx()
    return t.ctx
}

// Release marks one connection as complete. Must be called exactly once
// per successful Acquire.
func (t *ConnTracker) Release() {
@@ -65,9 +90,33 @@ func (t *ConnTracker) Drain(timeout time.Duration) {
    }
}

// ForceClose cancels all in-flight proxy connections by cancelling the
// shared context. Connections whose request context derives from Context()
// will see their requests aborted, causing the proxy handler to return
// and call Release(). Waits briefly for connections to actually release.
func (t *ConnTracker) ForceClose() {
    t.ctxMu.Lock()
    if t.cancel != nil {
        t.cancel()
    }
    t.ctxMu.Unlock()

    // Wait briefly for force-closed connections to call Release().
    done := make(chan struct{})
    go func() {
        t.wg.Wait()
        close(done)
    }()
    select {
    case <-done:
    case <-time.After(2 * time.Second):
    }
}

// Reset re-enables the tracker after a failed drain. This allows the
// sandbox to accept proxy connections again if the pause operation fails
// and the VM is resumed. It also cancels any lingering Drain goroutine.
// and the VM is resumed. It also cancels any lingering Drain goroutine
// and creates a fresh context for new connections.
func (t *ConnTracker) Reset() {
    t.cancelMu.Lock()
    if t.cancelDrain != nil {
@@ -81,5 +130,10 @@ func (t *ConnTracker) Reset() {
    }
    t.cancelMu.Unlock()

    // Replace the cancelled context with a fresh one.
    t.ctxMu.Lock()
    t.ctx, t.cancel = context.WithCancel(context.Background())
    t.ctxMu.Unlock()

    t.draining.Store(false)
}
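For context, the new ConnTracker context is meant to be threaded into proxy handlers. Below is a hedged sketch of how a handler might consume it; the handler itself is hypothetical and not part of this diff, and it assumes the standard net/http and context packages alongside the ConnTracker above.

```go
func proxyHandler(t *ConnTracker, upstream http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if !t.Acquire() { // tracker is draining; refuse new connections
			http.Error(w, "sandbox is pausing", http.StatusServiceUnavailable)
			return
		}
		defer t.Release() // exactly once per successful Acquire

		// Derive the request context from the tracker so ForceClose()
		// aborts this request mid-pause. (Simplified: a real handler would
		// also respect the client's own cancellation.)
		ctx, cancel := context.WithCancel(t.Context())
		defer cancel()
		upstream.ServeHTTP(w, r.WithContext(ctx))
	})
}
```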
@@ -95,10 +95,10 @@ type snapshotParent struct {
}

// maxDiffGenerations caps how many incremental diff generations we chain
// before falling back to a Full snapshot to collapse the chain. Long diff
// chains increase restore latency and snapshot directory size; a periodic
// Full snapshot resets the counter and produces a clean base.
const maxDiffGenerations = 8
// before merging diffs into a single file. Since UFFD lazy-loads memory
// anyway, we merge on every re-pause to keep exactly 1 diff file per
// snapshot — no accumulated chain, no extra restore overhead.
const maxDiffGenerations = 1

// buildMetadata constructs the metadata map with version information.
func (m *Manager) buildMetadata(envdVersion string) map[string]string {
@@ -186,9 +186,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
    }

    // Create dm-snapshot with per-sandbox CoW file.
    // CoW must be at least as large as the origin — if every block is
    // rewritten, the CoW stores a full copy. Undersized CoW causes
    // dm-snapshot invalidation → EIO on all guest I/O.
    dmName := "wrenn-" + sandboxID
    cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
    cowSize := int64(diskSizeMB) * 1024 * 1024
    cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
    dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
    if err != nil {
        m.loops.Release(baseRootfs)
@@ -374,28 +377,43 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
        return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
    }

    // Mark sandbox as pausing to block new exec/file/PTY operations.
    m.mu.Lock()
    sb.Status = models.StatusPausing
    m.mu.Unlock()

    // restoreRunning reverts state if any pre-freeze step fails.
    restoreRunning := func() {
        _ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
        sb.connTracker.Reset()
        m.mu.Lock()
        sb.Status = models.StatusRunning
        m.mu.Unlock()
        m.startSampler(sb)
    }

    // Stop the metrics sampler goroutine before tearing down any resources
    // it reads (dm device, Firecracker PID). Without this, the sampler
    // leaks on every successful pause.
    m.stopSampler(sb)

    // Step 0: Drain in-flight proxy connections before freezing vCPUs.
    // Stale TCP state from mid-flight connections causes issues on restore.
    sb.connTracker.Drain(2 * time.Second)
    slog.Debug("pause: proxy connections drained", "id", sandboxID)
    // ── Step 1: Isolate from external traffic ─────────────────────────
    // Drain in-flight proxy connections (grace period for clean shutdown).
    sb.connTracker.Drain(5 * time.Second)
    // Force-close any connections that didn't finish during grace period.
    sb.connTracker.ForceClose()
    slog.Debug("pause: external connections closed", "id", sandboxID)

    // Step 0b: Close host-side idle connections to envd. Done before
    // PrepareSnapshot so FIN packets propagate to the guest during the
    // PrepareSnapshot window (no extra sleep needed).
    // Close host-side idle connections to envd so FIN packets propagate
    // to the guest kernel before snapshot.
    sb.client.CloseIdleConnections()
    slog.Debug("pause: envd client idle connections closed", "id", sandboxID)

    // Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
    // connections for post-restore cleanup). The 3s timeout also gives time
    // for the FINs from Step 0b to be processed by the guest kernel.
    // Best-effort: a failure is logged but does not abort the pause.
    // ── Step 2: Drop page cache ──────────────────────────────────────
    // Signal envd to quiesce: drops page cache, stops port subsystem,
    // marks connections for post-restore cleanup. Page cache drop can
    // take significant time on large-memory VMs (20GB+).
    func() {
        prepCtx, prepCancel := context.WithTimeout(ctx, 3*time.Second)
        prepCtx, prepCancel := context.WithTimeout(ctx, 30*time.Second)
        defer prepCancel()
        if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
            slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
@@ -404,11 +422,37 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
        }
    }()

    // ── Step 3: Inflate balloon to reclaim free guest memory ─────────
    // Freed pages become zero from FC's perspective, so ProcessMemfile
    // skips them → dramatically smaller memfile (e.g. 20GB → 1GB).
    func() {
        memUsed, err := readEnvdMemUsed(sb.client)
        if err != nil {
            slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err)
            return
        }
        usedMiB := int(memUsed / (1024 * 1024))
        keepMiB := max(usedMiB*2, 256) + 128
        inflateMiB := sb.MemoryMB - keepMiB
        if inflateMiB <= 0 {
            slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB)
            return
        }
        balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second)
        defer balloonCancel()
        if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil {
            slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err)
            return
        }
        time.Sleep(2 * time.Second)
        slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB)
    }()

    // ── Step 4: Freeze vCPUs ─────────────────────────────────────────
    pauseStart := time.Now()

    // Step 1: Pause the VM (freeze vCPUs).
    if err := m.vm.Pause(ctx, sandboxID); err != nil {
        sb.connTracker.Reset()
        restoreRunning()
        return fmt.Errorf("pause VM: %w", err)
    }
    slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
@@ -423,13 +467,23 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {

    // resumeOnError unpauses the VM so the sandbox stays usable when a
    // post-freeze step fails. If the resume itself fails, the sandbox is
    // left frozen — the caller should destroy it. It also resets the
    // connection tracker so the sandbox can accept proxy connections again.
    // frozen and unrecoverable — destroy it to avoid a zombie.
    resumeOnError := func() {
        sb.connTracker.Reset()
        if err := m.vm.Resume(ctx, sandboxID); err != nil {
            slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
        // Use a fresh context — the caller's ctx may already be cancelled.
        resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second)
        defer resumeCancel()
        if err := m.vm.Resume(resumeCtx, sandboxID); err != nil {
            slog.Error("failed to resume VM after pause error — destroying frozen sandbox", "id", sandboxID, "error", err)
            m.cleanup(context.Background(), sb)
            m.mu.Lock()
            delete(m.boxes, sandboxID)
            m.mu.Unlock()
            if m.onDestroy != nil {
                m.onDestroy(sandboxID)
            }
            return
        }
        restoreRunning()
    }

    // Step 2: Take VM state snapshot (snapfile + memfile).
@@ -444,6 +498,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {

    snapshotStart := time.Now()
    if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
        slog.Error("pause: snapshot failed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart), "error", err)
        warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
        resumeOnError()
        return fmt.Errorf("create VM snapshot: %w", err)
@@ -795,6 +850,12 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
        slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
    }

    // Deflate balloon — the snapshot was taken with an inflated balloon to
    // reduce memfile size, so restore the guest's full memory allocation.
    if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
        slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
    }

    // Fetch envd version (best-effort).
    envdVersion, _ := client.FetchVersion(ctx)
@@ -1134,7 +1195,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team

    dmName := "wrenn-" + sandboxID
    cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID))
    cowSize := int64(diskSizeMB) * 1024 * 1024
    cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
    dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
    if err != nil {
        source.Close()
@@ -1235,6 +1296,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
        slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err)
    }

    // Deflate balloon — template snapshot was taken with an inflated balloon.
    if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
        slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
    }

    // Fetch envd version (best-effort).
    envdVersion, _ := client.FetchVersion(ctx)
@@ -1720,12 +1786,12 @@ func (m *Manager) startSampler(sb *sandboxState) {
    go m.samplerLoop(ctx, sb, fcPID, sb.VCPUs, initialCPU)
}

// samplerLoop samples /proc metrics at 500ms intervals.
// samplerLoop samples metrics at 1s intervals.
// lastCPU is goroutine-local to avoid shared-state races.
func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpus int, lastCPU cpuStat) {
    defer close(sb.samplerDone)

    ticker := time.NewTicker(500 * time.Millisecond)
    ticker := time.NewTicker(1 * time.Second)
    defer ticker.Stop()

    clkTck := 100.0 // sysconf(_SC_CLK_TCK), almost always 100 on Linux
@@ -1758,8 +1824,11 @@ func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpu
        cpuInitialized = true
    }

    // Memory: VmRSS of the Firecracker process.
    memBytes, _ := readMemRSS(fcPID)
    // Memory: guest-reported used memory from envd /metrics.
    // VmRSS of the Firecracker process includes guest page cache
    // and never decreases, so we use the guest's own view which
    // reports total - available (actual process memory).
    memBytes, _ := readEnvdMemUsed(sb.client)

    // Disk: allocated bytes of the CoW sparse file.
    var diskBytes int64
@@ -15,11 +15,11 @@ type MetricPoint struct {

// Ring buffer capacity constants.
const (
    ring10mCap = 1200 // 500ms × 1200 = 10 min
    ring2hCap  = 240  // 30s × 240 = 2 h
    ring24hCap = 288  // 5min × 288 = 24 h
    ring10mCap = 600 // 1s × 600 = 10 min
    ring2hCap  = 240 // 30s × 240 = 2 h
    ring24hCap = 288 // 5min × 288 = 24 h

    downsample2hEvery  = 60 // 60 × 500ms = 30s
    downsample2hEvery  = 30 // 30 × 1s = 30s
    downsample24hEvery = 10 // 10 × 30s = 5min
)

@@ -44,8 +44,8 @@ type metricsRing struct {
    count24h int

    // Accumulators for downsampling.
    acc500ms  [downsample2hEvery]MetricPoint
    acc500msN int
    acc1s  [downsample2hEvery]MetricPoint
    acc1sN int

    acc30s  [downsample24hEvery]MetricPoint
    acc30sN int
@@ -56,7 +56,7 @@ func newMetricsRing() *metricsRing {
    return &metricsRing{}
}

// Push adds a 500ms sample to the finest tier and triggers downsampling
// Push adds a 1s sample to the finest tier and triggers downsampling
// into coarser tiers when enough samples have accumulated.
func (r *metricsRing) Push(p MetricPoint) {
    r.mu.Lock()
@@ -70,12 +70,12 @@ func (r *metricsRing) Push(p MetricPoint) {
    }

    // Accumulate for 2h downsample.
    r.acc500ms[r.acc500msN] = p
    r.acc500msN++
    if r.acc500msN == downsample2hEvery {
        avg := averagePoints(r.acc500ms[:downsample2hEvery])
    r.acc1s[r.acc1sN] = p
    r.acc1sN++
    if r.acc1sN == downsample2hEvery {
        avg := averagePoints(r.acc1s[:downsample2hEvery])
        r.push2h(avg)
        r.acc500msN = 0
        r.acc1sN = 0
    }
}

@@ -138,7 +138,7 @@ func (r *metricsRing) Flush() (pts10m, pts2h, pts24h []MetricPoint) {
    r.idx10m, r.count10m = 0, 0
    r.idx2h, r.count2h = 0, 0
    r.idx24h, r.count24h = 0, 0
    r.acc500msN = 0
    r.acc1sN = 0
    r.acc30sN = 0

    return pts10m, pts2h, pts24h
@@ -1,11 +1,15 @@
package sandbox

import (
    "encoding/json"
    "fmt"
    "io"
    "os"
    "strconv"
    "strings"
    "syscall"

    "git.omukk.dev/wrenn/wrenn/internal/envdclient"
)

// cpuStat holds raw CPU jiffies read from /proc/{pid}/stat.
@@ -24,16 +28,11 @@ func readCPUStat(pid int) (cpuStat, error) {
        return cpuStat{}, fmt.Errorf("read stat: %w", err)
    }

    // /proc/{pid}/stat format: pid (comm) state fields...
    // The comm field may contain spaces and parens, so find the last ')' first.
    content := string(data)
    idx := strings.LastIndex(content, ")")
    if idx < 0 {
        return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: no closing paren", pid)
    }
    // After ")" there is " state field3 field4 ... fieldN"
    // field1 after ')' is state (index 0), utime is field 11, stime is field 12
    // (0-indexed from after the closing paren).
    fields := strings.Fields(content[idx+2:])
    if len(fields) < 13 {
        return cpuStat{}, fmt.Errorf("malformed /proc/%d/stat: too few fields (%d)", pid, len(fields))
@@ -49,27 +48,34 @@ func readCPUStat(pid int) (cpuStat, error) {
    return cpuStat{utime: utime, stime: stime}, nil
}

// readMemRSS reads VmRSS from /proc/{pid}/status and returns bytes.
func readMemRSS(pid int) (int64, error) {
    path := fmt.Sprintf("/proc/%d/status", pid)
    data, err := os.ReadFile(path)
// readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns
// guest-side total - MemAvailable (actual process memory, excluding reclaimable
// page cache). VmRSS of the Firecracker process includes guest page cache and
// never decreases, so this is the accurate metric for dashboard display.
func readEnvdMemUsed(client *envdclient.Client) (int64, error) {
    resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics")
    if err != nil {
        return 0, fmt.Errorf("read status: %w", err)
        return 0, fmt.Errorf("fetch envd metrics: %w", err)
    }
    for _, line := range strings.Split(string(data), "\n") {
        if strings.HasPrefix(line, "VmRSS:") {
            fields := strings.Fields(line)
            if len(fields) < 2 {
                return 0, fmt.Errorf("malformed VmRSS line")
            }
            kb, err := strconv.ParseInt(fields[1], 10, 64)
            if err != nil {
                return 0, fmt.Errorf("parse VmRSS: %w", err)
            }
            return kb * 1024, nil
        }
    defer resp.Body.Close()

    if resp.StatusCode != 200 {
        return 0, fmt.Errorf("envd metrics: status %d", resp.StatusCode)
    }
    return 0, fmt.Errorf("VmRSS not found in /proc/%d/status", pid)

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return 0, fmt.Errorf("read envd metrics body: %w", err)
    }

    var m struct {
        MemUsed int64 `json:"mem_used"`
    }
    if err := json.Unmarshal(body, &m); err != nil {
        return 0, fmt.Errorf("decode envd metrics: %w", err)
    }

    return m.MemUsed, nil
}

// readDiskAllocated returns the actual allocated bytes (not apparent size)