1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#55
Co-authored-by: pptx704 <rafeed@omukk.dev>
Co-committed-by: pptx704 <rafeed@omukk.dev>
This commit is contained in:
2026-06-20 22:45:08 +00:00
committed by Rafeed M. Bhuiyan
parent cfc0c52010
commit a08e755e53
53 changed files with 1675 additions and 577 deletions

View File

@ -130,22 +130,8 @@ func (h *execStreamHandler) runExecStream(ctx context.Context, conn *websocket.C
// Forward stream events to WebSocket.
for stream.Receive() {
resp := stream.Msg()
switch ev := resp.Event.(type) {
case *pb.ExecStreamResponse_Start:
writeWSJSON(conn, wsOutMsg{Type: "start", PID: ev.Start.Pid})
case *pb.ExecStreamResponse_Data:
switch o := ev.Data.Output.(type) {
case *pb.ExecStreamData_Stdout:
writeWSJSON(conn, wsOutMsg{Type: "stdout", Data: string(o.Stdout)})
case *pb.ExecStreamData_Stderr:
writeWSJSON(conn, wsOutMsg{Type: "stderr", Data: string(o.Stderr)})
}
case *pb.ExecStreamResponse_End:
exitCode := ev.End.ExitCode
writeWSJSON(conn, wsOutMsg{Type: "exit", ExitCode: &exitCode})
if m, ok := procRespToWSMsg(stream.Msg()); ok {
writeWSJSON(conn, m)
}
}
@ -159,6 +145,38 @@ func (h *execStreamHandler) runExecStream(ctx context.Context, conn *websocket.C
updateLastActive(h.db, sandboxID, sandboxIDStr)
}
// procStreamResp is satisfied by both *pb.ExecStreamResponse and
// *pb.ConnectProcessResponse: their oneof events carry the same inner messages,
// so the wire-to-WS mapping below is shared between the exec-stream and
// connect-process handlers.
type procStreamResp interface {
GetStart() *pb.ExecStreamStart
GetData() *pb.ExecStreamData
GetEnd() *pb.ExecStreamEnd
}
// procRespToWSMsg maps one process stream response to the WS message to send.
// The bool is false when the response carries nothing to forward.
func procRespToWSMsg(resp procStreamResp) (wsOutMsg, bool) {
if s := resp.GetStart(); s != nil {
return wsOutMsg{Type: "start", PID: s.Pid}, true
}
if d := resp.GetData(); d != nil {
switch o := d.Output.(type) {
case *pb.ExecStreamData_Stdout:
return wsOutMsg{Type: "stdout", Data: string(o.Stdout)}, true
case *pb.ExecStreamData_Stderr:
return wsOutMsg{Type: "stderr", Data: string(o.Stderr)}, true
}
return wsOutMsg{}, false
}
if e := resp.GetEnd(); e != nil {
exitCode := e.ExitCode
return wsOutMsg{Type: "exit", ExitCode: &exitCode}, true
}
return wsOutMsg{}, false
}
func sendWSError(conn *websocket.Conn, msg string) {
writeWSJSON(conn, wsOutMsg{Type: "error", Data: msg})
}

View File

@ -192,22 +192,8 @@ func (h *processHandler) runConnectProcess(ctx context.Context, conn *websocket.
// Forward stream events to WebSocket.
for stream.Receive() {
resp := stream.Msg()
switch ev := resp.Event.(type) {
case *pb.ConnectProcessResponse_Start:
writeWSJSON(conn, wsOutMsg{Type: "start", PID: ev.Start.Pid})
case *pb.ConnectProcessResponse_Data:
switch o := ev.Data.Output.(type) {
case *pb.ExecStreamData_Stdout:
writeWSJSON(conn, wsOutMsg{Type: "stdout", Data: string(o.Stdout)})
case *pb.ExecStreamData_Stderr:
writeWSJSON(conn, wsOutMsg{Type: "stderr", Data: string(o.Stderr)})
}
case *pb.ConnectProcessResponse_End:
exitCode := ev.End.ExitCode
writeWSJSON(conn, wsOutMsg{Type: "exit", ExitCode: &exitCode})
if m, ok := procRespToWSMsg(stream.Msg()); ok {
writeWSJSON(conn, m)
}
}

View File

@ -60,6 +60,10 @@ func agentErrToHTTP(err error) (int, string, string) {
return http.StatusServiceUnavailable, "no_hosts_available", "no servers available — try again later"
case connect.CodeUnimplemented:
return http.StatusNotImplemented, "agent_error", err.Error()
case connect.CodeDeadlineExceeded:
return http.StatusGatewayTimeout, "timeout", "command timed out"
case connect.CodeInternal:
return http.StatusInternalServerError, "agent_error", err.Error()
default:
return http.StatusBadGateway, "agent_error", err.Error()
}

View File

@ -144,7 +144,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
}
case events.CapsulePause:
if event.Outcome == events.OutcomeSuccess {
c.handleAutoPaused(ctx, sandboxID)
c.handleAutoPaused(ctx, sandboxID, event)
}
case events.CapsuleDestroy:
if event.Outcome == events.OutcomeSuccess {
@ -226,12 +226,35 @@ func (c *SandboxEventConsumer) handleStarted(ctx context.Context, sandboxID pgty
}
}
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID) {
// handleAutoPaused reflects an autonomous (TTL reaper / shutdown) pause in the
// DB and writes the audit row for it. The audit write happens only when the
// status flip actually applied, so a stream redelivery does not double-count,
// and so the HostMonitor host_state_sync fallback (which audits the
// callback-lost case) stays mutually exclusive with this path.
//
// Uses audit.Log (row only) — NOT LogSandboxAutoPause, which republishes a
// CapsulePause/system event that would loop straight back into this consumer.
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, event events.Event) {
for _, fromStatus := range []string{"running", "pausing"} {
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: fromStatus, Status_2: "paused",
}); err == nil {
slog.Debug("sandbox event consumer: auto-paused fallback applied", "sandbox_id", id.FormatSandboxID(sandboxID), "from", fromStatus)
slog.Debug("sandbox event consumer: auto-paused applied", "sandbox_id", id.FormatSandboxID(sandboxID), "from", fromStatus)
reason := event.Metadata["reason"]
if reason == "" {
reason = "ttl_expired"
}
teamID, _ := id.ParseTeamID(event.TeamID)
c.audit.Log(ctx, audit.Entry{
TeamID: teamID,
ActorType: "system",
ResourceType: "sandbox",
ResourceID: id.FormatSandboxID(sandboxID),
Action: "pause",
Scope: "team",
Status: "info",
Metadata: map[string]any{"reason": reason},
})
return
}
}

View File

@ -104,6 +104,14 @@ func (r *SSERelay) handleMessage(ctx context.Context, msg *redis.Message) {
if err != nil {
slog.Debug("sse relay: sandbox hydration failed (may be deleted)", "sandbox_id", event.Resource.ID, "error", err)
} else {
// Override the hydrated status with the status implied by the event
// verb. Autonomous transitions (e.g. TTL auto-pause) flip the DB row
// in a separate stream consumer that races this Pub/Sub read, so the
// hydrated row may still carry the pre-transition status. The event
// itself is authoritative for the resulting state.
if status, ok := impliedSandboxStatus(event); ok {
sb.Status = status
}
payload.Sandbox = sb
}
}
@ -138,6 +146,25 @@ func (r *SSERelay) hydrateSandbox(ctx context.Context, sandboxIDStr string) (*sa
return &resp, nil
}
// impliedSandboxStatus maps a successful capsule lifecycle event to the
// sandbox status it results in. Used to override a hydrated DB row that may
// still carry the pre-transition status because the reconciliation consumer
// that flips it races this Pub/Sub read. Returns false for events with no
// single deterministic resulting status (failures, destroy, state_changed).
func impliedSandboxStatus(event events.Event) (string, bool) {
if event.Outcome != events.OutcomeSuccess {
return "", false
}
switch event.Event {
case events.CapsulePause:
return "paused", true
case events.CapsuleResume, events.CapsuleCreate:
return "running", true
default:
return "", false
}
}
func isCapsuleEvent(eventType string) bool {
switch eventType {
case events.CapsuleCreate, events.CapsulePause, events.CapsuleResume, events.CapsuleDestroy, events.CapsuleStateChanged:

View File

@ -25,6 +25,7 @@ type Client struct {
hostIP string
base string
healthURL string
activityURL string
httpClient *http.Client
streamingClient *http.Client
@ -42,6 +43,7 @@ func New(hostIP string) *Client {
hostIP: hostIP,
base: base,
healthURL: base + "/health",
activityURL: base + "/activity",
httpClient: httpClient,
streamingClient: streamingClient,
process: genconnect.NewProcessClient(streamingClient, base),
@ -117,36 +119,17 @@ func (c *Client) Exec(ctx context.Context, cmd string, args []string, opts *Exec
result := &ExecResult{}
for stream.Receive() {
msg := stream.Msg()
if msg.Event == nil {
ev, ok := procEventToStreamEvent(stream.Msg().GetEvent())
if !ok {
continue
}
event := msg.Event.GetEvent()
switch e := event.(type) {
case *envdpb.ProcessEvent_Start:
slog.Debug("process started", "pid", e.Start.GetPid())
case *envdpb.ProcessEvent_Data:
output := e.Data.GetOutput()
switch o := output.(type) {
case *envdpb.ProcessEvent_DataEvent_Stdout:
result.Stdout = append(result.Stdout, o.Stdout...)
case *envdpb.ProcessEvent_DataEvent_Stderr:
result.Stderr = append(result.Stderr, o.Stderr...)
}
case *envdpb.ProcessEvent_End:
result.ExitCode = e.End.GetExitCode()
if e.End.Error != nil {
slog.Debug("process ended with error",
"exit_code", e.End.GetExitCode(),
"error", e.End.GetError(),
)
}
case *envdpb.ProcessEvent_Keepalive:
// Ignore keepalives.
switch ev.Type {
case "stdout":
result.Stdout = append(result.Stdout, ev.Data...)
case "stderr":
result.Stderr = append(result.Stderr, ev.Data...)
case "end":
result.ExitCode = ev.ExitCode
}
}
@ -166,6 +149,76 @@ type ExecStreamEvent struct {
Error string
}
// procEventToStreamEvent converts a raw envd ProcessEvent into an
// ExecStreamEvent. The second return is false for events with no payload to
// forward (nil event, keepalive, unknown data variant) so callers can skip
// them. This is the single decoder shared by Exec, ExecStream and
// ConnectProcess.
func procEventToStreamEvent(pe *envdpb.ProcessEvent) (ExecStreamEvent, bool) {
if pe == nil {
return ExecStreamEvent{}, false
}
switch e := pe.GetEvent().(type) {
case *envdpb.ProcessEvent_Start:
return ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}, true
case *envdpb.ProcessEvent_Data:
switch o := e.Data.GetOutput().(type) {
case *envdpb.ProcessEvent_DataEvent_Stdout:
return ExecStreamEvent{Type: "stdout", Data: o.Stdout}, true
case *envdpb.ProcessEvent_DataEvent_Stderr:
return ExecStreamEvent{Type: "stderr", Data: o.Stderr}, true
}
return ExecStreamEvent{}, false
case *envdpb.ProcessEvent_End:
ev := ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
if e.End.Error != nil {
ev.Error = e.End.GetError()
}
return ev, true
}
return ExecStreamEvent{}, false
}
// procEventStream is the subset of a Connect server-stream that pumpProcessEvents
// needs. Both *connect.ServerStreamForClient[StartResponse] and
// [ConnectResponse] satisfy it.
type procEventStream[T any] interface {
Receive() bool
Msg() *T
Err() error
Close() error
}
// pumpProcessEvents drains a process server-stream into ch until the stream ends
// or ctx is cancelled, closing ch on exit. getEvent extracts the ProcessEvent
// from each message so the same loop works for both the Start and Connect RPCs.
func pumpProcessEvents[T any](
ctx context.Context,
stream procEventStream[T],
getEvent func(*T) *envdpb.ProcessEvent,
ch chan<- ExecStreamEvent,
logLabel string,
) {
defer close(ch)
defer stream.Close()
for stream.Receive() {
ev, ok := procEventToStreamEvent(getEvent(stream.Msg()))
if !ok {
continue
}
select {
case ch <- ev:
case <-ctx.Done():
return
}
}
if err := stream.Err(); err != nil && err != io.EOF {
slog.Debug(logLabel, "error", err)
}
}
// ExecStream runs a command inside the sandbox and returns a channel of output events.
// The channel is closed when the process ends or the context is cancelled.
func (c *Client) ExecStream(ctx context.Context, cmd string, args ...string) (<-chan ExecStreamEvent, error) {
@ -184,52 +237,7 @@ func (c *Client) ExecStream(ctx context.Context, cmd string, args ...string) (<-
}
ch := make(chan ExecStreamEvent, 256)
go func() {
defer close(ch)
defer stream.Close()
for stream.Receive() {
msg := stream.Msg()
if msg.Event == nil {
continue
}
var ev ExecStreamEvent
event := msg.Event.GetEvent()
switch e := event.(type) {
case *envdpb.ProcessEvent_Start:
ev = ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}
case *envdpb.ProcessEvent_Data:
output := e.Data.GetOutput()
switch o := output.(type) {
case *envdpb.ProcessEvent_DataEvent_Stdout:
ev = ExecStreamEvent{Type: "stdout", Data: o.Stdout}
case *envdpb.ProcessEvent_DataEvent_Stderr:
ev = ExecStreamEvent{Type: "stderr", Data: o.Stderr}
}
case *envdpb.ProcessEvent_End:
ev = ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
if e.End.Error != nil {
ev.Error = e.End.GetError()
}
case *envdpb.ProcessEvent_Keepalive:
continue
}
select {
case ch <- ev:
case <-ctx.Done():
return
}
}
if err := stream.Err(); err != nil && err != io.EOF {
slog.Debug("exec stream error", "error", err)
}
}()
go pumpProcessEvents(ctx, stream, (*envdpb.StartResponse).GetEvent, ch, "exec stream error")
return ch, nil
}
@ -434,7 +442,7 @@ func (c *Client) CancelMemoryPreload(ctx context.Context) error {
// post-restore initialization. sandbox_id and template_id are passed
// so envd can set WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID env vars.
func (c *Client) PostInit(ctx context.Context) error {
return c.PostInitWithDefaults(ctx, "", nil, "", "")
return c.PostInitWithDefaults(ctx, "", nil, "", "", "")
}
// PostInitWithDefaults calls envd's POST /init endpoint with optional default
@ -444,7 +452,7 @@ func (c *Client) PostInit(ctx context.Context) error {
// timestamp and lifecycle_id are always populated: envd uses them to snap
// the guest clock to the host's wall time and to detect post-resume calls
// (which trigger port-forwarder restart + NFS remount).
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID string) error {
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID, proxyDomain string) error {
payload := map[string]any{
"timestamp": time.Now().UTC().Format(time.RFC3339Nano),
"lifecycle_id": uuid.NewString(),
@ -461,6 +469,9 @@ func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, e
if templateID != "" {
payload["template_id"] = templateID
}
if proxyDomain != "" {
payload["proxy_domain"] = proxyDomain
}
var body io.Reader
if len(payload) > 0 {

View File

@ -81,6 +81,42 @@ func (c *Client) WaitUntilRPCReady(ctx context.Context) error {
}
}
// Activity is envd's liveness snapshot: VM-wide CPU utilisation and IO
// throughput sampled inside the guest. The host activity sampler uses it to
// decide whether a sandbox is doing real work and should keep its TTL fresh.
type Activity struct {
CPUCount uint32 `json:"cpu_count"`
CPUUsedPct float32 `json:"cpu_used_pct"`
NetBps uint64 `json:"net_bps"`
DiskBps uint64 `json:"disk_bps"`
}
// FetchActivity polls envd's /activity endpoint. The endpoint serves straight
// from in-guest atomics (no syscalls), so it is cheap to call frequently.
func (c *Client) FetchActivity(ctx context.Context) (*Activity, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.activityURL, nil)
if err != nil {
return nil, fmt.Errorf("build activity request: %w", err)
}
resp, err := c.httpClient.Do(req)
if err != nil {
return nil, fmt.Errorf("fetch envd activity: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("activity check returned %d", resp.StatusCode)
}
var data Activity
if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
return nil, fmt.Errorf("decode activity response: %w", err)
}
return &data, nil
}
// healthCheck sends a single GET /health request to envd.
func (c *Client) healthCheck(ctx context.Context) error {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.healthURL, nil)

View File

@ -4,7 +4,6 @@ import (
"context"
"fmt"
"io"
"log/slog"
"connectrpc.com/connect"
@ -87,52 +86,7 @@ func (c *Client) ConnectProcess(ctx context.Context, pid uint32, tag string) (<-
}
ch := make(chan ExecStreamEvent, 16)
go func() {
defer close(ch)
defer stream.Close()
for stream.Receive() {
msg := stream.Msg()
if msg.Event == nil {
continue
}
var ev ExecStreamEvent
switch e := msg.Event.GetEvent().(type) {
case *envdpb.ProcessEvent_Start:
ev = ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}
case *envdpb.ProcessEvent_Data:
switch o := e.Data.GetOutput().(type) {
case *envdpb.ProcessEvent_DataEvent_Stdout:
ev = ExecStreamEvent{Type: "stdout", Data: o.Stdout}
case *envdpb.ProcessEvent_DataEvent_Stderr:
ev = ExecStreamEvent{Type: "stderr", Data: o.Stderr}
default:
continue
}
case *envdpb.ProcessEvent_End:
ev = ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
if e.End.Error != nil {
ev.Error = e.End.GetError()
}
case *envdpb.ProcessEvent_Keepalive:
continue
}
select {
case ch <- ev:
case <-ctx.Done():
return
}
}
if err := stream.Err(); err != nil && err != io.EOF {
slog.Debug("connect process stream error", "error", err)
}
}()
go pumpProcessEvents(ctx, stream, (*envdpb.ConnectResponse).GetEvent, ch, "connect process stream error")
return ch, nil
}

View File

@ -253,7 +253,7 @@ func (s *Server) Exec(
result, err := s.mgr.Exec(execCtx, msg.SandboxId, msg.Cmd, msg.Args, opts)
if err != nil {
return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("exec: %w", err))
return nil, envdErr("exec", err)
}
return connect.NewResponse(&pb.ExecResponse{
@ -395,31 +395,15 @@ func (s *Server) ExecStream(
}
for ev := range events {
start, data, end := execEventParts(ev)
var resp pb.ExecStreamResponse
switch ev.Type {
case "start":
resp.Event = &pb.ExecStreamResponse_Start{
Start: &pb.ExecStreamStart{Pid: ev.PID},
}
case "stdout":
resp.Event = &pb.ExecStreamResponse_Data{
Data: &pb.ExecStreamData{
Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data},
},
}
case "stderr":
resp.Event = &pb.ExecStreamResponse_Data{
Data: &pb.ExecStreamData{
Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data},
},
}
case "end":
resp.Event = &pb.ExecStreamResponse_End{
End: &pb.ExecStreamEnd{
ExitCode: ev.ExitCode,
Error: ev.Error,
},
}
switch {
case start != nil:
resp.Event = &pb.ExecStreamResponse_Start{Start: start}
case data != nil:
resp.Event = &pb.ExecStreamResponse_Data{Data: data}
case end != nil:
resp.Event = &pb.ExecStreamResponse_End{End: end}
default:
continue
}
@ -431,6 +415,24 @@ func (s *Server) ExecStream(
return nil
}
// execEventParts maps a streaming exec event to its proto inner message.
// Exactly one return value is non-nil; all-nil means the event carries nothing
// to forward. Shared by ExecStream and ConnectProcess, which differ only in the
// response envelope wrapping these inner messages.
func execEventParts(ev envdclient.ExecStreamEvent) (*pb.ExecStreamStart, *pb.ExecStreamData, *pb.ExecStreamEnd) {
switch ev.Type {
case "start":
return &pb.ExecStreamStart{Pid: ev.PID}, nil, nil
case "stdout":
return nil, &pb.ExecStreamData{Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data}}, nil
case "stderr":
return nil, &pb.ExecStreamData{Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data}}, nil
case "end":
return nil, nil, &pb.ExecStreamEnd{ExitCode: ev.ExitCode, Error: ev.Error}
}
return nil, nil, nil
}
func (s *Server) WriteFileStream(
ctx context.Context,
stream *connect.ClientStream[pb.WriteFileStreamRequest],
@ -912,31 +914,15 @@ func (s *Server) ConnectProcess(
}
for ev := range events {
start, data, end := execEventParts(ev)
var resp pb.ConnectProcessResponse
switch ev.Type {
case "start":
resp.Event = &pb.ConnectProcessResponse_Start{
Start: &pb.ExecStreamStart{Pid: ev.PID},
}
case "stdout":
resp.Event = &pb.ConnectProcessResponse_Data{
Data: &pb.ExecStreamData{
Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data},
},
}
case "stderr":
resp.Event = &pb.ConnectProcessResponse_Data{
Data: &pb.ExecStreamData{
Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data},
},
}
case "end":
resp.Event = &pb.ConnectProcessResponse_End{
End: &pb.ExecStreamEnd{
ExitCode: ev.ExitCode,
Error: ev.Error,
},
}
switch {
case start != nil:
resp.Event = &pb.ConnectProcessResponse_Start{Start: start}
case data != nil:
resp.Event = &pb.ConnectProcessResponse_Data{Data: data}
case end != nil:
resp.Event = &pb.ConnectProcessResponse_End{End: end}
default:
continue
}

View File

@ -0,0 +1,111 @@
package sandbox
import (
"testing"
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
)
func TestIsBusy(t *testing.T) {
tests := []struct {
name string
cfg Config
act envdclient.Activity
want bool
}{
// Default thresholds (zero cfg → defaults: cpu 5%, net 16K, disk 32K).
{"idle", Config{}, envdclient.Activity{CPUUsedPct: 0.5, NetBps: 100, DiskBps: 200}, false},
{"cpu just below", Config{}, envdclient.Activity{CPUUsedPct: 4.99}, false},
{"cpu at threshold", Config{}, envdclient.Activity{CPUUsedPct: 5.0}, true},
{"cpu above", Config{}, envdclient.Activity{CPUUsedPct: 80.0}, true},
{"net just below", Config{}, envdclient.Activity{NetBps: 16*1024 - 1}, false},
{"net at floor", Config{}, envdclient.Activity{NetBps: 16 * 1024}, true},
{"disk just below", Config{}, envdclient.Activity{DiskBps: 32*1024 - 1}, false},
{"disk at floor", Config{}, envdclient.Activity{DiskBps: 32 * 1024}, true},
{"download: low cpu, high net", Config{}, envdclient.Activity{CPUUsedPct: 1.0, NetBps: 5 * 1024 * 1024}, true},
// Explicit overrides take precedence over defaults.
{
"custom cpu threshold met",
Config{CPUBusyPct: 20.0},
envdclient.Activity{CPUUsedPct: 25.0},
true,
},
{
"custom cpu threshold not met",
Config{CPUBusyPct: 20.0},
envdclient.Activity{CPUUsedPct: 10.0},
false,
},
{
"custom net floor not met",
Config{NetFloorBps: 1024 * 1024},
envdclient.Activity{NetBps: 16 * 1024},
false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
m := &Manager{cfg: tt.cfg}
if got := m.isBusy(&tt.act); got != tt.want {
t.Errorf("isBusy(%+v) = %v, want %v", tt.act, got, tt.want)
}
})
}
}
func TestApplyBusySample(t *testing.T) {
// Debounce requires busyDebounceSamples consecutive busy samples before the
// first bump. Verify the streak math and bump timing.
if busyDebounceSamples != 2 {
t.Skip("test written for busyDebounceSamples=2")
}
tests := []struct {
name string
startStreak int
busy bool
wantStreak int
wantBump bool
}{
{"first busy, no bump yet", 0, true, 1, false},
{"second consecutive busy, bump", 1, true, 2, true},
{"sustained busy keeps bumping, streak held", 2, true, 2, true},
{"single noise spike from idle, no bump", 0, false, 0, false},
{"idle resets a building streak", 1, false, 0, false},
{"idle resets a saturated streak", 2, false, 0, false},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
gotStreak, gotBump := applyBusySample(tt.startStreak, tt.busy)
if gotStreak != tt.wantStreak || gotBump != tt.wantBump {
t.Errorf("applyBusySample(%d, %v) = (%d, %v), want (%d, %v)",
tt.startStreak, tt.busy, gotStreak, gotBump, tt.wantStreak, tt.wantBump)
}
})
}
}
// TestApplyBusySample_NoiseScenario walks a realistic sample sequence: brief
// noise never crosses the debounce, but sustained work does and then a return
// to idle resets — proving an isolated spike cannot keep a sandbox alive.
func TestApplyBusySample_NoiseScenario(t *testing.T) {
if busyDebounceSamples != 2 {
t.Skip("test written for busyDebounceSamples=2")
}
samples := []bool{true, false, false, true, true, true, false}
wantBumps := []bool{false, false, false, false, true, true, false}
streak := 0
for i, busy := range samples {
var bump bool
streak, bump = applyBusySample(streak, busy)
if bump != wantBumps[i] {
t.Errorf("sample %d (busy=%v): bump = %v, want %v (streak=%d)",
i, busy, bump, wantBumps[i], streak)
}
}
}

View File

@ -88,14 +88,47 @@ type Config struct {
EnvdTimeout time.Duration
DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB
// ProxyDomain is the public domain sandboxes are served under (e.g.
// "wrenn.dev"). Injected into envd at /init so `envd ports` can build
// {port}-{sandbox_id}.{domain} URLs.
ProxyDomain string
// Resolved at startup by the host agent.
KernelPath string // path to the latest vmlinux-x.y.z
KernelVersion string // semver extracted from filename
VMMBin string // path to the cloud-hypervisor binary
VMMVersion string // semver from cloud-hypervisor --version
AgentVersion string // host agent version (injected via ldflags)
// Activity sampler thresholds. The sampler polls each running sandbox's
// guest liveness and refreshes its TTL when it is doing real work, so a
// long-running but non-interactive job is not mistaken for inactive. A
// sandbox counts as busy when guest CPU ≥ CPUBusyPct, or net/disk
// throughput ≥ the respective floor (bytes/sec). Zero values fall back to
// the package defaults at sampler start.
ActivitySampleInterval time.Duration
CPUBusyPct float32
NetFloorBps uint64
DiskFloorBps uint64
}
// Activity sampler defaults. Thresholds sit clear of idle-VM background noise
// (envd's own sampler thread, guest timers) so a parked sandbox still times
// out; the debounce below guards against a lone noisy sample masquerading as
// work. All are env-overridable on the host agent.
const (
defaultActivitySampleInterval = 5 * time.Second
defaultCPUBusyPct = 5.0 // percent of total vCPU capacity
defaultNetFloorBps = 16 * 1024 // 16 KB/s
defaultDiskFloorBps = 32 * 1024 // 32 KB/s
activityPollTimeout = 3 * time.Second
activitySampleConcurrency = 16
// busyDebounceSamples is how many consecutive busy samples are required
// before the sandbox's TTL is refreshed. With a 5s interval, real work
// registers within ~10s while isolated noise spikes are ignored.
busyDebounceSamples = 2
)
// LifecycleEvent describes an autonomous state change initiated by the agent.
type LifecycleEvent struct {
Event string
@ -189,6 +222,12 @@ type sandboxState struct {
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
samplerDone chan struct{} // closed when the sampling goroutine exits
// activityBusyStreak counts consecutive busy activity samples. A single
// noisy sample (idle background CPU, a stray packet) must not refresh the
// TTL, so LastActiveAt is only bumped once the streak reaches
// busyDebounceSamples. Reset to 0 by any non-busy sample. Guarded by m.mu.
activityBusyStreak int
}
// buildMetadata constructs the metadata map with version information.
@ -419,14 +458,14 @@ func (m *Manager) Create(
// Fetch envd version (best-effort).
envdVersion, _ := client.FetchVersion(ctx)
// Apply template defaults via envd /init (no-op when both empty).
if defaultUser != "" || len(defaultEnv) > 0 {
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil {
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
}
initCancel()
// Apply template defaults + sandbox identity via envd /init. Always called
// on create so envd records its sandbox ID and proxy domain (used by
// `envd ports`), even when the template specifies no user/env defaults.
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID), m.cfg.ProxyDomain); err != nil {
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
}
initCancel()
now := time.Now()
sb := &sandboxState{
@ -667,7 +706,7 @@ func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string
if err != nil {
return err
}
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "", "")
}
// PtyAttach starts a new PTY process or reconnects to an existing one.
@ -762,6 +801,11 @@ func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool
if !sb.connTracker.Acquire() {
return nil, nil, false
}
// Inbound proxy traffic counts as activity: an idle web server reachable
// through the proxy should not be auto-paused while it is serving requests.
m.mu.Lock()
sb.LastActiveAt = time.Now()
m.mu.Unlock()
return sb.HostIP, sb.connTracker, true
}
@ -872,6 +916,146 @@ func (m *Manager) reapExpired(_ context.Context) {
}
}
// StartActivitySampler starts a background goroutine that polls each running
// sandbox's guest liveness (CPU + net/disk IO) and refreshes LastActiveAt when
// the sandbox is doing real work. This is what keeps a long-running but
// non-interactive job (a build, a download) from being auto-paused by the TTL
// reaper, while an idle workload (sleep, a parked shell) still times out.
func (m *Manager) StartActivitySampler(ctx context.Context) {
interval := m.cfg.ActivitySampleInterval
if interval <= 0 {
interval = defaultActivitySampleInterval
}
go func() {
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-m.stopCh:
return
case <-ticker.C:
m.sampleActivity(ctx)
}
}
}()
}
// activityTarget pairs a sandbox ID with the envd client to poll.
type activityTarget struct {
id string
client *envdclient.Client
}
func (m *Manager) sampleActivity(ctx context.Context) {
// Snapshot the running sandboxes and their clients under the lock, then
// poll over the network without holding it.
m.mu.RLock()
targets := make([]activityTarget, 0, len(m.boxes))
for id, sb := range m.boxes {
if sb.Status != models.StatusRunning {
continue
}
// Skip sandboxes still loading memory after a resume — they are not
// settled and their IO/CPU is preload noise, not user work.
if sb.memLoadDone != nil {
select {
case <-sb.memLoadDone:
default:
continue
}
}
c := sb.client.Load()
if c == nil {
continue
}
targets = append(targets, activityTarget{id: id, client: c})
}
m.mu.RUnlock()
if len(targets) == 0 {
return
}
sem := make(chan struct{}, activitySampleConcurrency)
var wg sync.WaitGroup
for _, t := range targets {
wg.Add(1)
sem <- struct{}{}
go func(t activityTarget) {
defer wg.Done()
defer func() { <-sem }()
m.pollAndBump(ctx, t)
}(t)
}
wg.Wait()
}
// pollAndBump fetches one sandbox's activity and refreshes its TTL once it has
// been busy for busyDebounceSamples consecutive samples. Poll failures are
// treated as a non-busy sample: an unreachable envd is handled by the reaper /
// heartbeat paths, and resetting the streak is the safe default.
func (m *Manager) pollAndBump(ctx context.Context, t activityTarget) {
pollCtx, cancel := context.WithTimeout(ctx, activityPollTimeout)
defer cancel()
act, err := t.client.FetchActivity(pollCtx)
busy := err == nil && m.isBusy(act)
m.mu.Lock()
defer m.mu.Unlock()
sb, ok := m.boxes[t.id]
if !ok || sb.Status != models.StatusRunning {
return
}
streak, bump := applyBusySample(sb.activityBusyStreak, busy)
sb.activityBusyStreak = streak
if bump {
sb.LastActiveAt = time.Now()
}
}
// applyBusySample advances a debounce streak with the latest sample and
// reports whether the TTL should be refreshed this tick. A non-busy sample
// resets the streak; the bump fires once the streak reaches the debounce
// threshold and on every busy tick thereafter (the streak is held at the
// threshold rather than growing unbounded).
func applyBusySample(streak int, busy bool) (newStreak int, bump bool) {
if !busy {
return 0, false
}
streak++
if streak >= busyDebounceSamples {
return busyDebounceSamples, true
}
return streak, false
}
// isBusy reports whether a guest liveness snapshot represents real work.
func (m *Manager) isBusy(act *envdclient.Activity) bool {
cpuThreshold := m.cfg.CPUBusyPct
if cpuThreshold <= 0 {
cpuThreshold = defaultCPUBusyPct
}
netFloor := m.cfg.NetFloorBps
if netFloor == 0 {
netFloor = defaultNetFloorBps
}
diskFloor := m.cfg.DiskFloorBps
if diskFloor == 0 {
diskFloor = defaultDiskFloorBps
}
return act.CPUUsedPct >= cpuThreshold ||
act.NetBps >= netFloor ||
act.DiskBps >= diskFloor
}
// Shutdown gracefully drains the manager. Running sandboxes are paused so
// their state survives across agent restarts; any sandboxes still holding
// runtime resources after PauseAll (e.g. paused failed, or status was

View File

@ -110,7 +110,7 @@ func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState
slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
return
}
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr); err != nil {
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr, m.cfg.ProxyDomain); err != nil {
slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", err)
}