1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#55
Co-authored-by: pptx704 <rafeed@omukk.dev>
Co-committed-by: pptx704 <rafeed@omukk.dev>
This commit is contained in:
2026-06-20 22:45:08 +00:00
committed by Rafeed M. Bhuiyan
parent cfc0c52010
commit a08e755e53
53 changed files with 1675 additions and 577 deletions

View File

@ -0,0 +1,111 @@
package sandbox
import (
"testing"
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
)
// TestIsBusy checks Manager.isBusy at and around each threshold, first with a
// zero Config (which selects the package defaults: cpu 5%, net 16K, disk 32K)
// and then with explicit Config overrides.
func TestIsBusy(t *testing.T) {
	type scenario struct {
		name string
		cfg  Config
		act  envdclient.Activity
		want bool
	}

	scenarios := []scenario{
		// Zero Config → package default thresholds.
		{name: "idle", act: envdclient.Activity{CPUUsedPct: 0.5, NetBps: 100, DiskBps: 200}, want: false},
		{name: "cpu just below", act: envdclient.Activity{CPUUsedPct: 4.99}, want: false},
		{name: "cpu at threshold", act: envdclient.Activity{CPUUsedPct: 5.0}, want: true},
		{name: "cpu above", act: envdclient.Activity{CPUUsedPct: 80.0}, want: true},
		{name: "net just below", act: envdclient.Activity{NetBps: 16*1024 - 1}, want: false},
		{name: "net at floor", act: envdclient.Activity{NetBps: 16 * 1024}, want: true},
		{name: "disk just below", act: envdclient.Activity{DiskBps: 32*1024 - 1}, want: false},
		{name: "disk at floor", act: envdclient.Activity{DiskBps: 32 * 1024}, want: true},
		{name: "download: low cpu, high net", act: envdclient.Activity{CPUUsedPct: 1.0, NetBps: 5 * 1024 * 1024}, want: true},

		// Explicit overrides win over the defaults.
		{
			name: "custom cpu threshold met",
			cfg:  Config{CPUBusyPct: 20.0},
			act:  envdclient.Activity{CPUUsedPct: 25.0},
			want: true,
		},
		{
			name: "custom cpu threshold not met",
			cfg:  Config{CPUBusyPct: 20.0},
			act:  envdclient.Activity{CPUUsedPct: 10.0},
			want: false,
		},
		{
			name: "custom net floor not met",
			cfg:  Config{NetFloorBps: 1024 * 1024},
			act:  envdclient.Activity{NetBps: 16 * 1024},
			want: false,
		},
	}

	for i := range scenarios {
		sc := &scenarios[i]
		t.Run(sc.name, func(t *testing.T) {
			mgr := &Manager{cfg: sc.cfg}
			got := mgr.isBusy(&sc.act)
			if got != sc.want {
				t.Errorf("isBusy(%+v) = %v, want %v", sc.act, got, sc.want)
			}
		})
	}
}
// TestApplyBusySample verifies the single-step debounce math: the streak a
// sample produces from a given starting streak, and whether that sample
// triggers a TTL bump. The expectations are written for a debounce of two
// samples, so the test skips itself if busyDebounceSamples changes.
func TestApplyBusySample(t *testing.T) {
	// Debounce requires busyDebounceSamples consecutive busy samples before the
	// first bump. Verify the streak math and bump timing.
	if busyDebounceSamples != 2 {
		t.Skip("test written for busyDebounceSamples=2")
	}
	tests := []struct {
		name        string
		startStreak int
		busy        bool
		wantStreak  int
		wantBump    bool
	}{
		// A lone busy sample (a noise spike) only starts a streak.
		{"first busy, no bump yet", 0, true, 1, false},
		{"second consecutive busy, bump", 1, true, 2, true},
		{"sustained busy keeps bumping, streak held", 2, true, 2, true},
		// Non-busy samples: the streak always drops to zero and never bumps.
		{"idle sample from idle, no bump", 0, false, 0, false},
		{"idle resets a building streak", 1, false, 0, false},
		{"idle resets a saturated streak", 2, false, 0, false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			gotStreak, gotBump := applyBusySample(tt.startStreak, tt.busy)
			if gotStreak != tt.wantStreak || gotBump != tt.wantBump {
				t.Errorf("applyBusySample(%d, %v) = (%d, %v), want (%d, %v)",
					tt.startStreak, tt.busy, gotStreak, gotBump, tt.wantStreak, tt.wantBump)
			}
		})
	}
}
// TestApplyBusySample_NoiseScenario feeds applyBusySample a realistic sequence
// of samples, carrying the streak from one step to the next. A lone busy
// sample must not bump, sustained busy samples must, and the first idle sample
// afterwards must stop the bumps — so an isolated spike cannot keep a sandbox
// alive.
func TestApplyBusySample_NoiseScenario(t *testing.T) {
	if busyDebounceSamples != 2 {
		t.Skip("test written for busyDebounceSamples=2")
	}

	steps := []struct {
		busy     bool
		wantBump bool
	}{
		{busy: true, wantBump: false},  // isolated spike
		{busy: false, wantBump: false}, // back to idle, streak reset
		{busy: false, wantBump: false},
		{busy: true, wantBump: false}, // work begins
		{busy: true, wantBump: true},  // debounce satisfied
		{busy: true, wantBump: true},  // still working
		{busy: false, wantBump: false},
	}

	streak := 0
	for i, step := range steps {
		next, bump := applyBusySample(streak, step.busy)
		streak = next
		if bump == step.wantBump {
			continue
		}
		t.Errorf("sample %d (busy=%v): bump = %v, want %v (streak=%d)",
			i, step.busy, bump, step.wantBump, streak)
	}
}

View File

@ -88,14 +88,47 @@ type Config struct {
EnvdTimeout time.Duration
DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB
// ProxyDomain is the public domain sandboxes are served under (e.g.
// "wrenn.dev"). Injected into envd at /init so `envd ports` can build
// {port}-{sandbox_id}.{domain} URLs.
ProxyDomain string
// Resolved at startup by the host agent.
KernelPath string // path to the latest vmlinux-x.y.z
KernelVersion string // semver extracted from filename
VMMBin string // path to the cloud-hypervisor binary
VMMVersion string // semver from cloud-hypervisor --version
AgentVersion string // host agent version (injected via ldflags)
// Activity sampler thresholds. The sampler polls each running sandbox's
// guest liveness and refreshes its TTL when it is doing real work, so a
// long-running but non-interactive job is not mistaken for inactive. A
// sandbox counts as busy when guest CPU ≥ CPUBusyPct, or net/disk
// throughput ≥ the respective floor (bytes/sec). Zero values fall back to
// the package defaults: the interval when the sampler starts, the thresholds
// each time a sample is evaluated.
ActivitySampleInterval time.Duration
CPUBusyPct float32
NetFloorBps uint64
DiskFloorBps uint64
}
// Activity sampler defaults. Thresholds sit clear of idle-VM background noise
// (envd's own sampler thread, guest timers) so a parked sandbox still times
// out; the debounce below guards against a lone noisy sample masquerading as
// work. The interval and the three thresholds are fallbacks for the matching
// Config fields (env-overridable on the host agent); the poll timeout,
// concurrency cap and debounce count are fixed constants.
const (
// Fallbacks used when the corresponding Config field is left at zero.
defaultActivitySampleInterval = 5 * time.Second
defaultCPUBusyPct = 5.0 // percent of total vCPU capacity
defaultNetFloorBps = 16 * 1024 // 16 KB/s
defaultDiskFloorBps = 32 * 1024 // 32 KB/s
// activityPollTimeout bounds a single sandbox's FetchActivity call.
activityPollTimeout = 3 * time.Second
// activitySampleConcurrency caps how many sandboxes are polled at once
// within one sampling tick.
activitySampleConcurrency = 16
// busyDebounceSamples is how many consecutive busy samples are required
// before the sandbox's TTL is refreshed. With a 5s interval, real work
// registers within ~10s while isolated noise spikes are ignored.
busyDebounceSamples = 2
)
// LifecycleEvent describes an autonomous state change initiated by the agent.
type LifecycleEvent struct {
Event string
@ -189,6 +222,12 @@ type sandboxState struct {
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
samplerDone chan struct{} // closed when the sampling goroutine exits
// activityBusyStreak counts consecutive busy activity samples. A single
// noisy sample (idle background CPU, a stray packet) must not refresh the
// TTL, so LastActiveAt is only bumped once the streak reaches
// busyDebounceSamples. Reset to 0 by any non-busy sample. Guarded by m.mu.
activityBusyStreak int
}
// buildMetadata constructs the metadata map with version information.
@ -419,14 +458,14 @@ func (m *Manager) Create(
// Fetch envd version (best-effort).
envdVersion, _ := client.FetchVersion(ctx)
// Apply template defaults via envd /init (no-op when both empty).
if defaultUser != "" || len(defaultEnv) > 0 {
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil {
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
}
initCancel()
// Apply template defaults + sandbox identity via envd /init. Always called
// on create so envd records its sandbox ID and proxy domain (used by
// `envd ports`), even when the template specifies no user/env defaults.
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID), m.cfg.ProxyDomain); err != nil {
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
}
initCancel()
now := time.Now()
sb := &sandboxState{
@ -667,7 +706,7 @@ func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string
if err != nil {
return err
}
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "", "")
}
// PtyAttach starts a new PTY process or reconnects to an existing one.
@ -762,6 +801,11 @@ func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool
if !sb.connTracker.Acquire() {
return nil, nil, false
}
// Inbound proxy traffic counts as activity: an idle web server reachable
// through the proxy should not be auto-paused while it is serving requests.
m.mu.Lock()
sb.LastActiveAt = time.Now()
m.mu.Unlock()
return sb.HostIP, sb.connTracker, true
}
@ -872,6 +916,146 @@ func (m *Manager) reapExpired(_ context.Context) {
}
}
// StartActivitySampler launches the background loop that periodically samples
// every running sandbox's guest activity (CPU plus net/disk IO) and refreshes
// LastActiveAt for sandboxes doing real work. That keeps a long-running,
// non-interactive job (a build, a download) from being auto-paused by the TTL
// reaper, while an idle workload (sleep, a parked shell) still times out.
//
// The period is m.cfg.ActivitySampleInterval, or the package default when
// that is not positive. The loop exits when ctx is cancelled or the manager's
// stop channel fires. The call itself returns immediately.
func (m *Manager) StartActivitySampler(ctx context.Context) {
	period := m.cfg.ActivitySampleInterval
	if period <= 0 {
		period = defaultActivitySampleInterval
	}

	loop := func() {
		tick := time.NewTicker(period)
		defer tick.Stop()
		for {
			select {
			case <-tick.C:
				m.sampleActivity(ctx)
			case <-m.stopCh:
				return
			case <-ctx.Done():
				return
			}
		}
	}
	go loop()
}
// activityTarget pairs a sandbox ID with the envd client to poll. Targets are
// snapshotted by sampleActivity while it holds the manager lock and are then
// polled after the lock is released.
type activityTarget struct {
// id is the sandbox's key in the manager's boxes map.
id string
// client is the envd client loaded from the sandbox at snapshot time.
client *envdclient.Client
}
// sampleActivity runs one sampling pass. It snapshots the pollable sandboxes
// under the read lock, releases the lock, then polls them over the network
// with at most activitySampleConcurrency polls in flight, returning only once
// every poll has finished.
//
// A sandbox is pollable when it is running, is not still loading memory after
// a resume (its IO/CPU at that point is preload noise, not user work), and has
// an envd client.
func (m *Manager) sampleActivity(ctx context.Context) {
	m.mu.RLock()
	pending := make([]activityTarget, 0, len(m.boxes))
	for sandboxID, state := range m.boxes {
		if state.Status != models.StatusRunning {
			continue
		}
		if done := state.memLoadDone; done != nil {
			// Non-blocking probe: only a closed/ready channel counts as settled.
			settled := false
			select {
			case <-done:
				settled = true
			default:
			}
			if !settled {
				continue
			}
		}
		if envd := state.client.Load(); envd != nil {
			pending = append(pending, activityTarget{id: sandboxID, client: envd})
		}
	}
	m.mu.RUnlock()

	if len(pending) == 0 {
		return
	}

	// slots is a counting semaphore; acquiring before spawning also bounds the
	// number of live goroutines.
	slots := make(chan struct{}, activitySampleConcurrency)
	var inflight sync.WaitGroup
	for i := range pending {
		target := pending[i]
		inflight.Add(1)
		slots <- struct{}{}
		go func() {
			defer func() {
				<-slots
				inflight.Done()
			}()
			m.pollAndBump(ctx, target)
		}()
	}
	inflight.Wait()
}
// pollAndBump polls a single sandbox's activity (bounded by
// activityPollTimeout) and feeds the result into that sandbox's debounce
// streak, refreshing LastActiveAt once the sandbox has been busy for
// busyDebounceSamples consecutive samples.
//
// A failed poll counts as a non-busy sample: an unreachable envd is handled by
// the reaper / heartbeat paths, and resetting the streak is the safe default.
// The network call happens before the manager lock is taken; if the sandbox
// has disappeared or stopped running in the meantime, nothing is updated.
func (m *Manager) pollAndBump(ctx context.Context, t activityTarget) {
	pollCtx, cancel := context.WithTimeout(ctx, activityPollTimeout)
	defer cancel()

	busy := false
	if act, err := t.client.FetchActivity(pollCtx); err == nil {
		busy = m.isBusy(act)
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	sb, found := m.boxes[t.id]
	if !found || sb.Status != models.StatusRunning {
		return
	}

	var refresh bool
	sb.activityBusyStreak, refresh = applyBusySample(sb.activityBusyStreak, busy)
	if refresh {
		sb.LastActiveAt = time.Now()
	}
}
// applyBusySample folds one activity sample into a debounce streak.
//
// It returns the updated streak and whether the TTL should be refreshed on
// this tick:
//   - a non-busy sample clears the streak and never bumps;
//   - a busy sample that leaves the streak short of busyDebounceSamples
//     extends it without bumping;
//   - any other busy sample bumps, and the streak is pinned at
//     busyDebounceSamples rather than growing without bound.
func applyBusySample(streak int, busy bool) (newStreak int, bump bool) {
	switch next := streak + 1; {
	case !busy:
		return 0, false
	case next < busyDebounceSamples:
		return next, false
	default:
		return busyDebounceSamples, true
	}
}
// isBusy reports whether a guest liveness snapshot represents real work: CPU
// usage at or above the CPU threshold, or network or disk throughput at or
// above its floor.
//
// Thresholds come from the manager's Config; a non-positive CPUBusyPct and a
// zero NetFloorBps or DiskFloorBps fall back to the package defaults. A nil
// snapshot is reported as not busy, so a poll that produced no data can never
// panic the sampler goroutine calling this.
func (m *Manager) isBusy(act *envdclient.Activity) bool {
	if act == nil {
		return false
	}

	cpuThreshold := m.cfg.CPUBusyPct
	if cpuThreshold <= 0 {
		cpuThreshold = defaultCPUBusyPct
	}
	netFloor := m.cfg.NetFloorBps
	if netFloor == 0 {
		netFloor = defaultNetFloorBps
	}
	diskFloor := m.cfg.DiskFloorBps
	if diskFloor == 0 {
		diskFloor = defaultDiskFloorBps
	}

	// Any single signal crossing its threshold is enough.
	return act.CPUUsedPct >= cpuThreshold ||
		act.NetBps >= netFloor ||
		act.DiskBps >= diskFloor
}
// Shutdown gracefully drains the manager. Running sandboxes are paused so
// their state survives across agent restarts; any sandboxes still holding
// runtime resources after PauseAll (e.g. paused failed, or status was

View File

@ -110,7 +110,7 @@ func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState
slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
return
}
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr); err != nil {
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr, m.cfg.ProxyDomain); err != nil {
slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", err)
}