forked from wrenn/wrenn
v0.2.1 (#55)
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#55 Co-authored-by: pptx704 <rafeed@omukk.dev> Co-committed-by: pptx704 <rafeed@omukk.dev>
This commit is contained in:
@ -88,14 +88,47 @@ type Config struct {
|
||||
EnvdTimeout time.Duration
|
||||
DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB
|
||||
|
||||
// ProxyDomain is the public domain sandboxes are served under (e.g.
|
||||
// "wrenn.dev"). Injected into envd at /init so `envd ports` can build
|
||||
// {port}-{sandbox_id}.{domain} URLs.
|
||||
ProxyDomain string
|
||||
|
||||
// Resolved at startup by the host agent.
|
||||
KernelPath string // path to the latest vmlinux-x.y.z
|
||||
KernelVersion string // semver extracted from filename
|
||||
VMMBin string // path to the cloud-hypervisor binary
|
||||
VMMVersion string // semver from cloud-hypervisor --version
|
||||
AgentVersion string // host agent version (injected via ldflags)
|
||||
|
||||
// Activity sampler thresholds. The sampler polls each running sandbox's
|
||||
// guest liveness and refreshes its TTL when it is doing real work, so a
|
||||
// long-running but non-interactive job is not mistaken for inactive. A
|
||||
// sandbox counts as busy when guest CPU ≥ CPUBusyPct, or net/disk
|
||||
// throughput ≥ the respective floor (bytes/sec). Zero values fall back to
|
||||
// the package defaults at sampler start.
|
||||
ActivitySampleInterval time.Duration
|
||||
CPUBusyPct float32
|
||||
NetFloorBps uint64
|
||||
DiskFloorBps uint64
|
||||
}
|
||||
|
||||
// Defaults for the activity sampler. The busy thresholds are placed above the
// background noise of an idle VM (envd's own sampler thread, guest timers) so
// that a parked sandbox still reaches its timeout, while the debounce count
// keeps one stray noisy sample from being counted as work. Every value here
// can be overridden through the host agent's environment.
const (
	// Sampling cadence and the per-metric "busy" thresholds.
	defaultActivitySampleInterval = 5 * time.Second
	defaultCPUBusyPct             = 5.0      // percent of total vCPU capacity
	defaultNetFloorBps            = 16 << 10 // 16 KB/s
	defaultDiskFloorBps           = 32 << 10 // 32 KB/s

	// Deadline applied to a single sandbox poll, and the maximum number of
	// polls allowed in flight at once during a sampling pass.
	activityPollTimeout       = 3 * time.Second
	activitySampleConcurrency = 16

	// busyDebounceSamples is the number of back-to-back busy samples needed
	// before a sandbox's TTL is refreshed. At the default 5s interval that
	// means real work is recognised within roughly 10s, and an isolated
	// noise spike never refreshes the TTL on its own.
	busyDebounceSamples = 2
)
|
||||
|
||||
// LifecycleEvent describes an autonomous state change initiated by the agent.
|
||||
type LifecycleEvent struct {
|
||||
Event string
|
||||
@ -189,6 +222,12 @@ type sandboxState struct {
|
||||
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
|
||||
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
|
||||
samplerDone chan struct{} // closed when the sampling goroutine exits
|
||||
|
||||
// activityBusyStreak counts consecutive busy activity samples. A single
|
||||
// noisy sample (idle background CPU, a stray packet) must not refresh the
|
||||
// TTL, so LastActiveAt is only bumped once the streak reaches
|
||||
// busyDebounceSamples. Reset to 0 by any non-busy sample. Guarded by m.mu.
|
||||
activityBusyStreak int
|
||||
}
|
||||
|
||||
// buildMetadata constructs the metadata map with version information.
|
||||
@ -419,14 +458,14 @@ func (m *Manager) Create(
|
||||
// Fetch envd version (best-effort).
|
||||
envdVersion, _ := client.FetchVersion(ctx)
|
||||
|
||||
// Apply template defaults via envd /init (no-op when both empty).
|
||||
if defaultUser != "" || len(defaultEnv) > 0 {
|
||||
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
||||
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil {
|
||||
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
|
||||
}
|
||||
initCancel()
|
||||
// Apply template defaults + sandbox identity via envd /init. Always called
|
||||
// on create so envd records its sandbox ID and proxy domain (used by
|
||||
// `envd ports`), even when the template specifies no user/env defaults.
|
||||
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
||||
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID), m.cfg.ProxyDomain); err != nil {
|
||||
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
|
||||
}
|
||||
initCancel()
|
||||
|
||||
now := time.Now()
|
||||
sb := &sandboxState{
|
||||
@ -667,7 +706,7 @@ func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
|
||||
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "", "")
|
||||
}
|
||||
|
||||
// PtyAttach starts a new PTY process or reconnects to an existing one.
|
||||
@ -762,6 +801,11 @@ func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool
|
||||
if !sb.connTracker.Acquire() {
|
||||
return nil, nil, false
|
||||
}
|
||||
// Inbound proxy traffic counts as activity: an idle web server reachable
|
||||
// through the proxy should not be auto-paused while it is serving requests.
|
||||
m.mu.Lock()
|
||||
sb.LastActiveAt = time.Now()
|
||||
m.mu.Unlock()
|
||||
return sb.HostIP, sb.connTracker, true
|
||||
}
|
||||
|
||||
@ -872,6 +916,146 @@ func (m *Manager) reapExpired(_ context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// StartActivitySampler starts a background goroutine that polls each running
|
||||
// sandbox's guest liveness (CPU + net/disk IO) and refreshes LastActiveAt when
|
||||
// the sandbox is doing real work. This is what keeps a long-running but
|
||||
// non-interactive job (a build, a download) from being auto-paused by the TTL
|
||||
// reaper, while an idle workload (sleep, a parked shell) still times out.
|
||||
func (m *Manager) StartActivitySampler(ctx context.Context) {
|
||||
interval := m.cfg.ActivitySampleInterval
|
||||
if interval <= 0 {
|
||||
interval = defaultActivitySampleInterval
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-m.stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.sampleActivity(ctx)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// activityTarget pairs a sandbox ID with the envd client to poll.
|
||||
type activityTarget struct {
|
||||
id string
|
||||
client *envdclient.Client
|
||||
}
|
||||
|
||||
func (m *Manager) sampleActivity(ctx context.Context) {
|
||||
// Snapshot the running sandboxes and their clients under the lock, then
|
||||
// poll over the network without holding it.
|
||||
m.mu.RLock()
|
||||
targets := make([]activityTarget, 0, len(m.boxes))
|
||||
for id, sb := range m.boxes {
|
||||
if sb.Status != models.StatusRunning {
|
||||
continue
|
||||
}
|
||||
// Skip sandboxes still loading memory after a resume — they are not
|
||||
// settled and their IO/CPU is preload noise, not user work.
|
||||
if sb.memLoadDone != nil {
|
||||
select {
|
||||
case <-sb.memLoadDone:
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
c := sb.client.Load()
|
||||
if c == nil {
|
||||
continue
|
||||
}
|
||||
targets = append(targets, activityTarget{id: id, client: c})
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
if len(targets) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, activitySampleConcurrency)
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range targets {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(t activityTarget) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
m.pollAndBump(ctx, t)
|
||||
}(t)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// pollAndBump fetches one sandbox's activity and refreshes its TTL once it has
|
||||
// been busy for busyDebounceSamples consecutive samples. Poll failures are
|
||||
// treated as a non-busy sample: an unreachable envd is handled by the reaper /
|
||||
// heartbeat paths, and resetting the streak is the safe default.
|
||||
func (m *Manager) pollAndBump(ctx context.Context, t activityTarget) {
|
||||
pollCtx, cancel := context.WithTimeout(ctx, activityPollTimeout)
|
||||
defer cancel()
|
||||
|
||||
act, err := t.client.FetchActivity(pollCtx)
|
||||
busy := err == nil && m.isBusy(act)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
sb, ok := m.boxes[t.id]
|
||||
if !ok || sb.Status != models.StatusRunning {
|
||||
return
|
||||
}
|
||||
|
||||
streak, bump := applyBusySample(sb.activityBusyStreak, busy)
|
||||
sb.activityBusyStreak = streak
|
||||
if bump {
|
||||
sb.LastActiveAt = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
// applyBusySample advances a debounce streak with the latest sample and
|
||||
// reports whether the TTL should be refreshed this tick. A non-busy sample
|
||||
// resets the streak; the bump fires once the streak reaches the debounce
|
||||
// threshold and on every busy tick thereafter (the streak is held at the
|
||||
// threshold rather than growing unbounded).
|
||||
func applyBusySample(streak int, busy bool) (newStreak int, bump bool) {
|
||||
if !busy {
|
||||
return 0, false
|
||||
}
|
||||
streak++
|
||||
if streak >= busyDebounceSamples {
|
||||
return busyDebounceSamples, true
|
||||
}
|
||||
return streak, false
|
||||
}
|
||||
|
||||
// isBusy reports whether a guest liveness snapshot represents real work.
|
||||
func (m *Manager) isBusy(act *envdclient.Activity) bool {
|
||||
cpuThreshold := m.cfg.CPUBusyPct
|
||||
if cpuThreshold <= 0 {
|
||||
cpuThreshold = defaultCPUBusyPct
|
||||
}
|
||||
netFloor := m.cfg.NetFloorBps
|
||||
if netFloor == 0 {
|
||||
netFloor = defaultNetFloorBps
|
||||
}
|
||||
diskFloor := m.cfg.DiskFloorBps
|
||||
if diskFloor == 0 {
|
||||
diskFloor = defaultDiskFloorBps
|
||||
}
|
||||
|
||||
return act.CPUUsedPct >= cpuThreshold ||
|
||||
act.NetBps >= netFloor ||
|
||||
act.DiskBps >= diskFloor
|
||||
}
|
||||
|
||||
// Shutdown gracefully drains the manager. Running sandboxes are paused so
|
||||
// their state survives across agent restarts; any sandboxes still holding
|
||||
// runtime resources after PauseAll (e.g. paused failed, or status was
|
||||
|
||||
Reference in New Issue
Block a user