forked from wrenn/wrenn
v0.2.1 (#55)
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#55 Co-authored-by: pptx704 <rafeed@omukk.dev> Co-committed-by: pptx704 <rafeed@omukk.dev>
This commit is contained in:
@ -130,22 +130,8 @@ func (h *execStreamHandler) runExecStream(ctx context.Context, conn *websocket.C
|
||||
|
||||
// Forward stream events to WebSocket.
|
||||
for stream.Receive() {
|
||||
resp := stream.Msg()
|
||||
switch ev := resp.Event.(type) {
|
||||
case *pb.ExecStreamResponse_Start:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "start", PID: ev.Start.Pid})
|
||||
|
||||
case *pb.ExecStreamResponse_Data:
|
||||
switch o := ev.Data.Output.(type) {
|
||||
case *pb.ExecStreamData_Stdout:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "stdout", Data: string(o.Stdout)})
|
||||
case *pb.ExecStreamData_Stderr:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "stderr", Data: string(o.Stderr)})
|
||||
}
|
||||
|
||||
case *pb.ExecStreamResponse_End:
|
||||
exitCode := ev.End.ExitCode
|
||||
writeWSJSON(conn, wsOutMsg{Type: "exit", ExitCode: &exitCode})
|
||||
if m, ok := procRespToWSMsg(stream.Msg()); ok {
|
||||
writeWSJSON(conn, m)
|
||||
}
|
||||
}
|
||||
|
||||
@ -159,6 +145,38 @@ func (h *execStreamHandler) runExecStream(ctx context.Context, conn *websocket.C
|
||||
updateLastActive(h.db, sandboxID, sandboxIDStr)
|
||||
}
|
||||
|
||||
// procStreamResp is satisfied by both *pb.ExecStreamResponse and
|
||||
// *pb.ConnectProcessResponse: their oneof events carry the same inner messages,
|
||||
// so the wire-to-WS mapping below is shared between the exec-stream and
|
||||
// connect-process handlers.
|
||||
type procStreamResp interface {
|
||||
GetStart() *pb.ExecStreamStart
|
||||
GetData() *pb.ExecStreamData
|
||||
GetEnd() *pb.ExecStreamEnd
|
||||
}
|
||||
|
||||
// procRespToWSMsg maps one process stream response to the WS message to send.
|
||||
// The bool is false when the response carries nothing to forward.
|
||||
func procRespToWSMsg(resp procStreamResp) (wsOutMsg, bool) {
|
||||
if s := resp.GetStart(); s != nil {
|
||||
return wsOutMsg{Type: "start", PID: s.Pid}, true
|
||||
}
|
||||
if d := resp.GetData(); d != nil {
|
||||
switch o := d.Output.(type) {
|
||||
case *pb.ExecStreamData_Stdout:
|
||||
return wsOutMsg{Type: "stdout", Data: string(o.Stdout)}, true
|
||||
case *pb.ExecStreamData_Stderr:
|
||||
return wsOutMsg{Type: "stderr", Data: string(o.Stderr)}, true
|
||||
}
|
||||
return wsOutMsg{}, false
|
||||
}
|
||||
if e := resp.GetEnd(); e != nil {
|
||||
exitCode := e.ExitCode
|
||||
return wsOutMsg{Type: "exit", ExitCode: &exitCode}, true
|
||||
}
|
||||
return wsOutMsg{}, false
|
||||
}
|
||||
|
||||
func sendWSError(conn *websocket.Conn, msg string) {
|
||||
writeWSJSON(conn, wsOutMsg{Type: "error", Data: msg})
|
||||
}
|
||||
|
||||
@ -192,22 +192,8 @@ func (h *processHandler) runConnectProcess(ctx context.Context, conn *websocket.
|
||||
|
||||
// Forward stream events to WebSocket.
|
||||
for stream.Receive() {
|
||||
resp := stream.Msg()
|
||||
switch ev := resp.Event.(type) {
|
||||
case *pb.ConnectProcessResponse_Start:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "start", PID: ev.Start.Pid})
|
||||
|
||||
case *pb.ConnectProcessResponse_Data:
|
||||
switch o := ev.Data.Output.(type) {
|
||||
case *pb.ExecStreamData_Stdout:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "stdout", Data: string(o.Stdout)})
|
||||
case *pb.ExecStreamData_Stderr:
|
||||
writeWSJSON(conn, wsOutMsg{Type: "stderr", Data: string(o.Stderr)})
|
||||
}
|
||||
|
||||
case *pb.ConnectProcessResponse_End:
|
||||
exitCode := ev.End.ExitCode
|
||||
writeWSJSON(conn, wsOutMsg{Type: "exit", ExitCode: &exitCode})
|
||||
if m, ok := procRespToWSMsg(stream.Msg()); ok {
|
||||
writeWSJSON(conn, m)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -60,6 +60,10 @@ func agentErrToHTTP(err error) (int, string, string) {
|
||||
return http.StatusServiceUnavailable, "no_hosts_available", "no servers available — try again later"
|
||||
case connect.CodeUnimplemented:
|
||||
return http.StatusNotImplemented, "agent_error", err.Error()
|
||||
case connect.CodeDeadlineExceeded:
|
||||
return http.StatusGatewayTimeout, "timeout", "command timed out"
|
||||
case connect.CodeInternal:
|
||||
return http.StatusInternalServerError, "agent_error", err.Error()
|
||||
default:
|
||||
return http.StatusBadGateway, "agent_error", err.Error()
|
||||
}
|
||||
|
||||
@ -144,7 +144,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
|
||||
}
|
||||
case events.CapsulePause:
|
||||
if event.Outcome == events.OutcomeSuccess {
|
||||
c.handleAutoPaused(ctx, sandboxID)
|
||||
c.handleAutoPaused(ctx, sandboxID, event)
|
||||
}
|
||||
case events.CapsuleDestroy:
|
||||
if event.Outcome == events.OutcomeSuccess {
|
||||
@ -226,12 +226,35 @@ func (c *SandboxEventConsumer) handleStarted(ctx context.Context, sandboxID pgty
|
||||
}
|
||||
}
|
||||
|
||||
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID) {
|
||||
// handleAutoPaused reflects an autonomous (TTL reaper / shutdown) pause in the
|
||||
// DB and writes the audit row for it. The audit write happens only when the
|
||||
// status flip actually applied, so a stream redelivery does not double-count,
|
||||
// and so the HostMonitor host_state_sync fallback (which audits the
|
||||
// callback-lost case) stays mutually exclusive with this path.
|
||||
//
|
||||
// Uses audit.Log (row only) — NOT LogSandboxAutoPause, which republishes a
|
||||
// CapsulePause/system event that would loop straight back into this consumer.
|
||||
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, event events.Event) {
|
||||
for _, fromStatus := range []string{"running", "pausing"} {
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: fromStatus, Status_2: "paused",
|
||||
}); err == nil {
|
||||
slog.Debug("sandbox event consumer: auto-paused fallback applied", "sandbox_id", id.FormatSandboxID(sandboxID), "from", fromStatus)
|
||||
slog.Debug("sandbox event consumer: auto-paused applied", "sandbox_id", id.FormatSandboxID(sandboxID), "from", fromStatus)
|
||||
reason := event.Metadata["reason"]
|
||||
if reason == "" {
|
||||
reason = "ttl_expired"
|
||||
}
|
||||
teamID, _ := id.ParseTeamID(event.TeamID)
|
||||
c.audit.Log(ctx, audit.Entry{
|
||||
TeamID: teamID,
|
||||
ActorType: "system",
|
||||
ResourceType: "sandbox",
|
||||
ResourceID: id.FormatSandboxID(sandboxID),
|
||||
Action: "pause",
|
||||
Scope: "team",
|
||||
Status: "info",
|
||||
Metadata: map[string]any{"reason": reason},
|
||||
})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,6 +104,14 @@ func (r *SSERelay) handleMessage(ctx context.Context, msg *redis.Message) {
|
||||
if err != nil {
|
||||
slog.Debug("sse relay: sandbox hydration failed (may be deleted)", "sandbox_id", event.Resource.ID, "error", err)
|
||||
} else {
|
||||
// Override the hydrated status with the status implied by the event
|
||||
// verb. Autonomous transitions (e.g. TTL auto-pause) flip the DB row
|
||||
// in a separate stream consumer that races this Pub/Sub read, so the
|
||||
// hydrated row may still carry the pre-transition status. The event
|
||||
// itself is authoritative for the resulting state.
|
||||
if status, ok := impliedSandboxStatus(event); ok {
|
||||
sb.Status = status
|
||||
}
|
||||
payload.Sandbox = sb
|
||||
}
|
||||
}
|
||||
@ -138,6 +146,25 @@ func (r *SSERelay) hydrateSandbox(ctx context.Context, sandboxIDStr string) (*sa
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
// impliedSandboxStatus maps a successful capsule lifecycle event to the
|
||||
// sandbox status it results in. Used to override a hydrated DB row that may
|
||||
// still carry the pre-transition status because the reconciliation consumer
|
||||
// that flips it races this Pub/Sub read. Returns false for events with no
|
||||
// single deterministic resulting status (failures, destroy, state_changed).
|
||||
func impliedSandboxStatus(event events.Event) (string, bool) {
|
||||
if event.Outcome != events.OutcomeSuccess {
|
||||
return "", false
|
||||
}
|
||||
switch event.Event {
|
||||
case events.CapsulePause:
|
||||
return "paused", true
|
||||
case events.CapsuleResume, events.CapsuleCreate:
|
||||
return "running", true
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
func isCapsuleEvent(eventType string) bool {
|
||||
switch eventType {
|
||||
case events.CapsuleCreate, events.CapsulePause, events.CapsuleResume, events.CapsuleDestroy, events.CapsuleStateChanged:
|
||||
|
||||
@ -25,6 +25,7 @@ type Client struct {
|
||||
hostIP string
|
||||
base string
|
||||
healthURL string
|
||||
activityURL string
|
||||
httpClient *http.Client
|
||||
streamingClient *http.Client
|
||||
|
||||
@ -42,6 +43,7 @@ func New(hostIP string) *Client {
|
||||
hostIP: hostIP,
|
||||
base: base,
|
||||
healthURL: base + "/health",
|
||||
activityURL: base + "/activity",
|
||||
httpClient: httpClient,
|
||||
streamingClient: streamingClient,
|
||||
process: genconnect.NewProcessClient(streamingClient, base),
|
||||
@ -117,36 +119,17 @@ func (c *Client) Exec(ctx context.Context, cmd string, args []string, opts *Exec
|
||||
result := &ExecResult{}
|
||||
|
||||
for stream.Receive() {
|
||||
msg := stream.Msg()
|
||||
if msg.Event == nil {
|
||||
ev, ok := procEventToStreamEvent(stream.Msg().GetEvent())
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
event := msg.Event.GetEvent()
|
||||
switch e := event.(type) {
|
||||
case *envdpb.ProcessEvent_Start:
|
||||
slog.Debug("process started", "pid", e.Start.GetPid())
|
||||
|
||||
case *envdpb.ProcessEvent_Data:
|
||||
output := e.Data.GetOutput()
|
||||
switch o := output.(type) {
|
||||
case *envdpb.ProcessEvent_DataEvent_Stdout:
|
||||
result.Stdout = append(result.Stdout, o.Stdout...)
|
||||
case *envdpb.ProcessEvent_DataEvent_Stderr:
|
||||
result.Stderr = append(result.Stderr, o.Stderr...)
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_End:
|
||||
result.ExitCode = e.End.GetExitCode()
|
||||
if e.End.Error != nil {
|
||||
slog.Debug("process ended with error",
|
||||
"exit_code", e.End.GetExitCode(),
|
||||
"error", e.End.GetError(),
|
||||
)
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_Keepalive:
|
||||
// Ignore keepalives.
|
||||
switch ev.Type {
|
||||
case "stdout":
|
||||
result.Stdout = append(result.Stdout, ev.Data...)
|
||||
case "stderr":
|
||||
result.Stderr = append(result.Stderr, ev.Data...)
|
||||
case "end":
|
||||
result.ExitCode = ev.ExitCode
|
||||
}
|
||||
}
|
||||
|
||||
@ -166,6 +149,76 @@ type ExecStreamEvent struct {
|
||||
Error string
|
||||
}
|
||||
|
||||
// procEventToStreamEvent converts a raw envd ProcessEvent into an
|
||||
// ExecStreamEvent. The second return is false for events with no payload to
|
||||
// forward (nil event, keepalive, unknown data variant) so callers can skip
|
||||
// them. This is the single decoder shared by Exec, ExecStream and
|
||||
// ConnectProcess.
|
||||
func procEventToStreamEvent(pe *envdpb.ProcessEvent) (ExecStreamEvent, bool) {
|
||||
if pe == nil {
|
||||
return ExecStreamEvent{}, false
|
||||
}
|
||||
switch e := pe.GetEvent().(type) {
|
||||
case *envdpb.ProcessEvent_Start:
|
||||
return ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}, true
|
||||
case *envdpb.ProcessEvent_Data:
|
||||
switch o := e.Data.GetOutput().(type) {
|
||||
case *envdpb.ProcessEvent_DataEvent_Stdout:
|
||||
return ExecStreamEvent{Type: "stdout", Data: o.Stdout}, true
|
||||
case *envdpb.ProcessEvent_DataEvent_Stderr:
|
||||
return ExecStreamEvent{Type: "stderr", Data: o.Stderr}, true
|
||||
}
|
||||
return ExecStreamEvent{}, false
|
||||
case *envdpb.ProcessEvent_End:
|
||||
ev := ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
|
||||
if e.End.Error != nil {
|
||||
ev.Error = e.End.GetError()
|
||||
}
|
||||
return ev, true
|
||||
}
|
||||
return ExecStreamEvent{}, false
|
||||
}
|
||||
|
||||
// procEventStream is the subset of a Connect server-stream that pumpProcessEvents
|
||||
// needs. Both *connect.ServerStreamForClient[StartResponse] and
|
||||
// [ConnectResponse] satisfy it.
|
||||
type procEventStream[T any] interface {
|
||||
Receive() bool
|
||||
Msg() *T
|
||||
Err() error
|
||||
Close() error
|
||||
}
|
||||
|
||||
// pumpProcessEvents drains a process server-stream into ch until the stream ends
|
||||
// or ctx is cancelled, closing ch on exit. getEvent extracts the ProcessEvent
|
||||
// from each message so the same loop works for both the Start and Connect RPCs.
|
||||
func pumpProcessEvents[T any](
|
||||
ctx context.Context,
|
||||
stream procEventStream[T],
|
||||
getEvent func(*T) *envdpb.ProcessEvent,
|
||||
ch chan<- ExecStreamEvent,
|
||||
logLabel string,
|
||||
) {
|
||||
defer close(ch)
|
||||
defer stream.Close()
|
||||
|
||||
for stream.Receive() {
|
||||
ev, ok := procEventToStreamEvent(getEvent(stream.Msg()))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
select {
|
||||
case ch <- ev:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if err := stream.Err(); err != nil && err != io.EOF {
|
||||
slog.Debug(logLabel, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ExecStream runs a command inside the sandbox and returns a channel of output events.
|
||||
// The channel is closed when the process ends or the context is cancelled.
|
||||
func (c *Client) ExecStream(ctx context.Context, cmd string, args ...string) (<-chan ExecStreamEvent, error) {
|
||||
@ -184,52 +237,7 @@ func (c *Client) ExecStream(ctx context.Context, cmd string, args ...string) (<-
|
||||
}
|
||||
|
||||
ch := make(chan ExecStreamEvent, 256)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
defer stream.Close()
|
||||
|
||||
for stream.Receive() {
|
||||
msg := stream.Msg()
|
||||
if msg.Event == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var ev ExecStreamEvent
|
||||
event := msg.Event.GetEvent()
|
||||
switch e := event.(type) {
|
||||
case *envdpb.ProcessEvent_Start:
|
||||
ev = ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}
|
||||
|
||||
case *envdpb.ProcessEvent_Data:
|
||||
output := e.Data.GetOutput()
|
||||
switch o := output.(type) {
|
||||
case *envdpb.ProcessEvent_DataEvent_Stdout:
|
||||
ev = ExecStreamEvent{Type: "stdout", Data: o.Stdout}
|
||||
case *envdpb.ProcessEvent_DataEvent_Stderr:
|
||||
ev = ExecStreamEvent{Type: "stderr", Data: o.Stderr}
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_End:
|
||||
ev = ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
|
||||
if e.End.Error != nil {
|
||||
ev.Error = e.End.GetError()
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_Keepalive:
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case ch <- ev:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if err := stream.Err(); err != nil && err != io.EOF {
|
||||
slog.Debug("exec stream error", "error", err)
|
||||
}
|
||||
}()
|
||||
go pumpProcessEvents(ctx, stream, (*envdpb.StartResponse).GetEvent, ch, "exec stream error")
|
||||
|
||||
return ch, nil
|
||||
}
|
||||
@ -434,7 +442,7 @@ func (c *Client) CancelMemoryPreload(ctx context.Context) error {
|
||||
// post-restore initialization. sandbox_id and template_id are passed
|
||||
// so envd can set WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID env vars.
|
||||
func (c *Client) PostInit(ctx context.Context) error {
|
||||
return c.PostInitWithDefaults(ctx, "", nil, "", "")
|
||||
return c.PostInitWithDefaults(ctx, "", nil, "", "", "")
|
||||
}
|
||||
|
||||
// PostInitWithDefaults calls envd's POST /init endpoint with optional default
|
||||
@ -444,7 +452,7 @@ func (c *Client) PostInit(ctx context.Context) error {
|
||||
// timestamp and lifecycle_id are always populated: envd uses them to snap
|
||||
// the guest clock to the host's wall time and to detect post-resume calls
|
||||
// (which trigger port-forwarder restart + NFS remount).
|
||||
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID string) error {
|
||||
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID, proxyDomain string) error {
|
||||
payload := map[string]any{
|
||||
"timestamp": time.Now().UTC().Format(time.RFC3339Nano),
|
||||
"lifecycle_id": uuid.NewString(),
|
||||
@ -461,6 +469,9 @@ func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, e
|
||||
if templateID != "" {
|
||||
payload["template_id"] = templateID
|
||||
}
|
||||
if proxyDomain != "" {
|
||||
payload["proxy_domain"] = proxyDomain
|
||||
}
|
||||
|
||||
var body io.Reader
|
||||
if len(payload) > 0 {
|
||||
|
||||
@ -81,6 +81,42 @@ func (c *Client) WaitUntilRPCReady(ctx context.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
// Activity is envd's liveness snapshot: VM-wide CPU utilisation and IO
|
||||
// throughput sampled inside the guest. The host activity sampler uses it to
|
||||
// decide whether a sandbox is doing real work and should keep its TTL fresh.
|
||||
type Activity struct {
|
||||
CPUCount uint32 `json:"cpu_count"`
|
||||
CPUUsedPct float32 `json:"cpu_used_pct"`
|
||||
NetBps uint64 `json:"net_bps"`
|
||||
DiskBps uint64 `json:"disk_bps"`
|
||||
}
|
||||
|
||||
// FetchActivity polls envd's /activity endpoint. The endpoint serves straight
|
||||
// from in-guest atomics (no syscalls), so it is cheap to call frequently.
|
||||
func (c *Client) FetchActivity(ctx context.Context) (*Activity, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.activityURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("build activity request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := c.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetch envd activity: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("activity check returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var data Activity
|
||||
if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
|
||||
return nil, fmt.Errorf("decode activity response: %w", err)
|
||||
}
|
||||
|
||||
return &data, nil
|
||||
}
|
||||
|
||||
// healthCheck sends a single GET /health request to envd.
|
||||
func (c *Client) healthCheck(ctx context.Context) error {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.healthURL, nil)
|
||||
|
||||
@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
|
||||
"connectrpc.com/connect"
|
||||
|
||||
@ -87,52 +86,7 @@ func (c *Client) ConnectProcess(ctx context.Context, pid uint32, tag string) (<-
|
||||
}
|
||||
|
||||
ch := make(chan ExecStreamEvent, 16)
|
||||
go func() {
|
||||
defer close(ch)
|
||||
defer stream.Close()
|
||||
|
||||
for stream.Receive() {
|
||||
msg := stream.Msg()
|
||||
if msg.Event == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var ev ExecStreamEvent
|
||||
switch e := msg.Event.GetEvent().(type) {
|
||||
case *envdpb.ProcessEvent_Start:
|
||||
ev = ExecStreamEvent{Type: "start", PID: e.Start.GetPid()}
|
||||
|
||||
case *envdpb.ProcessEvent_Data:
|
||||
switch o := e.Data.GetOutput().(type) {
|
||||
case *envdpb.ProcessEvent_DataEvent_Stdout:
|
||||
ev = ExecStreamEvent{Type: "stdout", Data: o.Stdout}
|
||||
case *envdpb.ProcessEvent_DataEvent_Stderr:
|
||||
ev = ExecStreamEvent{Type: "stderr", Data: o.Stderr}
|
||||
default:
|
||||
continue
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_End:
|
||||
ev = ExecStreamEvent{Type: "end", ExitCode: e.End.GetExitCode()}
|
||||
if e.End.Error != nil {
|
||||
ev.Error = e.End.GetError()
|
||||
}
|
||||
|
||||
case *envdpb.ProcessEvent_Keepalive:
|
||||
continue
|
||||
}
|
||||
|
||||
select {
|
||||
case ch <- ev:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if err := stream.Err(); err != nil && err != io.EOF {
|
||||
slog.Debug("connect process stream error", "error", err)
|
||||
}
|
||||
}()
|
||||
go pumpProcessEvents(ctx, stream, (*envdpb.ConnectResponse).GetEvent, ch, "connect process stream error")
|
||||
|
||||
return ch, nil
|
||||
}
|
||||
|
||||
@ -253,7 +253,7 @@ func (s *Server) Exec(
|
||||
|
||||
result, err := s.mgr.Exec(execCtx, msg.SandboxId, msg.Cmd, msg.Args, opts)
|
||||
if err != nil {
|
||||
return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("exec: %w", err))
|
||||
return nil, envdErr("exec", err)
|
||||
}
|
||||
|
||||
return connect.NewResponse(&pb.ExecResponse{
|
||||
@ -395,31 +395,15 @@ func (s *Server) ExecStream(
|
||||
}
|
||||
|
||||
for ev := range events {
|
||||
start, data, end := execEventParts(ev)
|
||||
var resp pb.ExecStreamResponse
|
||||
switch ev.Type {
|
||||
case "start":
|
||||
resp.Event = &pb.ExecStreamResponse_Start{
|
||||
Start: &pb.ExecStreamStart{Pid: ev.PID},
|
||||
}
|
||||
case "stdout":
|
||||
resp.Event = &pb.ExecStreamResponse_Data{
|
||||
Data: &pb.ExecStreamData{
|
||||
Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data},
|
||||
},
|
||||
}
|
||||
case "stderr":
|
||||
resp.Event = &pb.ExecStreamResponse_Data{
|
||||
Data: &pb.ExecStreamData{
|
||||
Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data},
|
||||
},
|
||||
}
|
||||
case "end":
|
||||
resp.Event = &pb.ExecStreamResponse_End{
|
||||
End: &pb.ExecStreamEnd{
|
||||
ExitCode: ev.ExitCode,
|
||||
Error: ev.Error,
|
||||
},
|
||||
}
|
||||
switch {
|
||||
case start != nil:
|
||||
resp.Event = &pb.ExecStreamResponse_Start{Start: start}
|
||||
case data != nil:
|
||||
resp.Event = &pb.ExecStreamResponse_Data{Data: data}
|
||||
case end != nil:
|
||||
resp.Event = &pb.ExecStreamResponse_End{End: end}
|
||||
default:
|
||||
continue
|
||||
}
|
||||
@ -431,6 +415,24 @@ func (s *Server) ExecStream(
|
||||
return nil
|
||||
}
|
||||
|
||||
// execEventParts maps a streaming exec event to its proto inner message.
|
||||
// Exactly one return value is non-nil; all-nil means the event carries nothing
|
||||
// to forward. Shared by ExecStream and ConnectProcess, which differ only in the
|
||||
// response envelope wrapping these inner messages.
|
||||
func execEventParts(ev envdclient.ExecStreamEvent) (*pb.ExecStreamStart, *pb.ExecStreamData, *pb.ExecStreamEnd) {
|
||||
switch ev.Type {
|
||||
case "start":
|
||||
return &pb.ExecStreamStart{Pid: ev.PID}, nil, nil
|
||||
case "stdout":
|
||||
return nil, &pb.ExecStreamData{Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data}}, nil
|
||||
case "stderr":
|
||||
return nil, &pb.ExecStreamData{Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data}}, nil
|
||||
case "end":
|
||||
return nil, nil, &pb.ExecStreamEnd{ExitCode: ev.ExitCode, Error: ev.Error}
|
||||
}
|
||||
return nil, nil, nil
|
||||
}
|
||||
|
||||
func (s *Server) WriteFileStream(
|
||||
ctx context.Context,
|
||||
stream *connect.ClientStream[pb.WriteFileStreamRequest],
|
||||
@ -912,31 +914,15 @@ func (s *Server) ConnectProcess(
|
||||
}
|
||||
|
||||
for ev := range events {
|
||||
start, data, end := execEventParts(ev)
|
||||
var resp pb.ConnectProcessResponse
|
||||
switch ev.Type {
|
||||
case "start":
|
||||
resp.Event = &pb.ConnectProcessResponse_Start{
|
||||
Start: &pb.ExecStreamStart{Pid: ev.PID},
|
||||
}
|
||||
case "stdout":
|
||||
resp.Event = &pb.ConnectProcessResponse_Data{
|
||||
Data: &pb.ExecStreamData{
|
||||
Output: &pb.ExecStreamData_Stdout{Stdout: ev.Data},
|
||||
},
|
||||
}
|
||||
case "stderr":
|
||||
resp.Event = &pb.ConnectProcessResponse_Data{
|
||||
Data: &pb.ExecStreamData{
|
||||
Output: &pb.ExecStreamData_Stderr{Stderr: ev.Data},
|
||||
},
|
||||
}
|
||||
case "end":
|
||||
resp.Event = &pb.ConnectProcessResponse_End{
|
||||
End: &pb.ExecStreamEnd{
|
||||
ExitCode: ev.ExitCode,
|
||||
Error: ev.Error,
|
||||
},
|
||||
}
|
||||
switch {
|
||||
case start != nil:
|
||||
resp.Event = &pb.ConnectProcessResponse_Start{Start: start}
|
||||
case data != nil:
|
||||
resp.Event = &pb.ConnectProcessResponse_Data{Data: data}
|
||||
case end != nil:
|
||||
resp.Event = &pb.ConnectProcessResponse_End{End: end}
|
||||
default:
|
||||
continue
|
||||
}
|
||||
|
||||
111
internal/sandbox/activity_test.go
Normal file
111
internal/sandbox/activity_test.go
Normal file
@ -0,0 +1,111 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
|
||||
)
|
||||
|
||||
func TestIsBusy(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
cfg Config
|
||||
act envdclient.Activity
|
||||
want bool
|
||||
}{
|
||||
// Default thresholds (zero cfg → defaults: cpu 5%, net 16K, disk 32K).
|
||||
{"idle", Config{}, envdclient.Activity{CPUUsedPct: 0.5, NetBps: 100, DiskBps: 200}, false},
|
||||
{"cpu just below", Config{}, envdclient.Activity{CPUUsedPct: 4.99}, false},
|
||||
{"cpu at threshold", Config{}, envdclient.Activity{CPUUsedPct: 5.0}, true},
|
||||
{"cpu above", Config{}, envdclient.Activity{CPUUsedPct: 80.0}, true},
|
||||
{"net just below", Config{}, envdclient.Activity{NetBps: 16*1024 - 1}, false},
|
||||
{"net at floor", Config{}, envdclient.Activity{NetBps: 16 * 1024}, true},
|
||||
{"disk just below", Config{}, envdclient.Activity{DiskBps: 32*1024 - 1}, false},
|
||||
{"disk at floor", Config{}, envdclient.Activity{DiskBps: 32 * 1024}, true},
|
||||
{"download: low cpu, high net", Config{}, envdclient.Activity{CPUUsedPct: 1.0, NetBps: 5 * 1024 * 1024}, true},
|
||||
|
||||
// Explicit overrides take precedence over defaults.
|
||||
{
|
||||
"custom cpu threshold met",
|
||||
Config{CPUBusyPct: 20.0},
|
||||
envdclient.Activity{CPUUsedPct: 25.0},
|
||||
true,
|
||||
},
|
||||
{
|
||||
"custom cpu threshold not met",
|
||||
Config{CPUBusyPct: 20.0},
|
||||
envdclient.Activity{CPUUsedPct: 10.0},
|
||||
false,
|
||||
},
|
||||
{
|
||||
"custom net floor not met",
|
||||
Config{NetFloorBps: 1024 * 1024},
|
||||
envdclient.Activity{NetBps: 16 * 1024},
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
m := &Manager{cfg: tt.cfg}
|
||||
if got := m.isBusy(&tt.act); got != tt.want {
|
||||
t.Errorf("isBusy(%+v) = %v, want %v", tt.act, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyBusySample(t *testing.T) {
|
||||
// Debounce requires busyDebounceSamples consecutive busy samples before the
|
||||
// first bump. Verify the streak math and bump timing.
|
||||
if busyDebounceSamples != 2 {
|
||||
t.Skip("test written for busyDebounceSamples=2")
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
startStreak int
|
||||
busy bool
|
||||
wantStreak int
|
||||
wantBump bool
|
||||
}{
|
||||
{"first busy, no bump yet", 0, true, 1, false},
|
||||
{"second consecutive busy, bump", 1, true, 2, true},
|
||||
{"sustained busy keeps bumping, streak held", 2, true, 2, true},
|
||||
{"single noise spike from idle, no bump", 0, false, 0, false},
|
||||
{"idle resets a building streak", 1, false, 0, false},
|
||||
{"idle resets a saturated streak", 2, false, 0, false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
gotStreak, gotBump := applyBusySample(tt.startStreak, tt.busy)
|
||||
if gotStreak != tt.wantStreak || gotBump != tt.wantBump {
|
||||
t.Errorf("applyBusySample(%d, %v) = (%d, %v), want (%d, %v)",
|
||||
tt.startStreak, tt.busy, gotStreak, gotBump, tt.wantStreak, tt.wantBump)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestApplyBusySample_NoiseScenario walks a realistic sample sequence: brief
|
||||
// noise never crosses the debounce, but sustained work does and then a return
|
||||
// to idle resets — proving an isolated spike cannot keep a sandbox alive.
|
||||
func TestApplyBusySample_NoiseScenario(t *testing.T) {
|
||||
if busyDebounceSamples != 2 {
|
||||
t.Skip("test written for busyDebounceSamples=2")
|
||||
}
|
||||
|
||||
samples := []bool{true, false, false, true, true, true, false}
|
||||
wantBumps := []bool{false, false, false, false, true, true, false}
|
||||
|
||||
streak := 0
|
||||
for i, busy := range samples {
|
||||
var bump bool
|
||||
streak, bump = applyBusySample(streak, busy)
|
||||
if bump != wantBumps[i] {
|
||||
t.Errorf("sample %d (busy=%v): bump = %v, want %v (streak=%d)",
|
||||
i, busy, bump, wantBumps[i], streak)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -88,14 +88,47 @@ type Config struct {
|
||||
EnvdTimeout time.Duration
|
||||
DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB
|
||||
|
||||
// ProxyDomain is the public domain sandboxes are served under (e.g.
|
||||
// "wrenn.dev"). Injected into envd at /init so `envd ports` can build
|
||||
// {port}-{sandbox_id}.{domain} URLs.
|
||||
ProxyDomain string
|
||||
|
||||
// Resolved at startup by the host agent.
|
||||
KernelPath string // path to the latest vmlinux-x.y.z
|
||||
KernelVersion string // semver extracted from filename
|
||||
VMMBin string // path to the cloud-hypervisor binary
|
||||
VMMVersion string // semver from cloud-hypervisor --version
|
||||
AgentVersion string // host agent version (injected via ldflags)
|
||||
|
||||
// Activity sampler thresholds. The sampler polls each running sandbox's
|
||||
// guest liveness and refreshes its TTL when it is doing real work, so a
|
||||
// long-running but non-interactive job is not mistaken for inactive. A
|
||||
// sandbox counts as busy when guest CPU ≥ CPUBusyPct, or net/disk
|
||||
// throughput ≥ the respective floor (bytes/sec). Zero values fall back to
|
||||
// the package defaults at sampler start.
|
||||
ActivitySampleInterval time.Duration
|
||||
CPUBusyPct float32
|
||||
NetFloorBps uint64
|
||||
DiskFloorBps uint64
|
||||
}
|
||||
|
||||
// Activity sampler defaults. Thresholds sit clear of idle-VM background noise
|
||||
// (envd's own sampler thread, guest timers) so a parked sandbox still times
|
||||
// out; the debounce below guards against a lone noisy sample masquerading as
|
||||
// work. All are env-overridable on the host agent.
|
||||
const (
|
||||
defaultActivitySampleInterval = 5 * time.Second
|
||||
defaultCPUBusyPct = 5.0 // percent of total vCPU capacity
|
||||
defaultNetFloorBps = 16 * 1024 // 16 KB/s
|
||||
defaultDiskFloorBps = 32 * 1024 // 32 KB/s
|
||||
activityPollTimeout = 3 * time.Second
|
||||
activitySampleConcurrency = 16
|
||||
// busyDebounceSamples is how many consecutive busy samples are required
|
||||
// before the sandbox's TTL is refreshed. With a 5s interval, real work
|
||||
// registers within ~10s while isolated noise spikes are ignored.
|
||||
busyDebounceSamples = 2
|
||||
)
|
||||
|
||||
// LifecycleEvent describes an autonomous state change initiated by the agent.
|
||||
type LifecycleEvent struct {
|
||||
Event string
|
||||
@ -189,6 +222,12 @@ type sandboxState struct {
|
||||
ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics
|
||||
samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine
|
||||
samplerDone chan struct{} // closed when the sampling goroutine exits
|
||||
|
||||
// activityBusyStreak counts consecutive busy activity samples. A single
|
||||
// noisy sample (idle background CPU, a stray packet) must not refresh the
|
||||
// TTL, so LastActiveAt is only bumped once the streak reaches
|
||||
// busyDebounceSamples. Reset to 0 by any non-busy sample. Guarded by m.mu.
|
||||
activityBusyStreak int
|
||||
}
|
||||
|
||||
// buildMetadata constructs the metadata map with version information.
|
||||
@ -419,14 +458,14 @@ func (m *Manager) Create(
|
||||
// Fetch envd version (best-effort).
|
||||
envdVersion, _ := client.FetchVersion(ctx)
|
||||
|
||||
// Apply template defaults via envd /init (no-op when both empty).
|
||||
if defaultUser != "" || len(defaultEnv) > 0 {
|
||||
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
||||
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID)); err != nil {
|
||||
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
|
||||
}
|
||||
initCancel()
|
||||
// Apply template defaults + sandbox identity via envd /init. Always called
|
||||
// on create so envd records its sandbox ID and proxy domain (used by
|
||||
// `envd ports`), even when the template specifies no user/env defaults.
|
||||
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
||||
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, id.UUIDString(templateID), m.cfg.ProxyDomain); err != nil {
|
||||
slog.Warn("post-create PostInit failed", "id", sandboxID, "error", err)
|
||||
}
|
||||
initCancel()
|
||||
|
||||
now := time.Now()
|
||||
sb := &sandboxState{
|
||||
@ -667,7 +706,7 @@ func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "")
|
||||
return c.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "", "")
|
||||
}
|
||||
|
||||
// PtyAttach starts a new PTY process or reconnects to an existing one.
|
||||
@ -762,6 +801,11 @@ func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool
|
||||
if !sb.connTracker.Acquire() {
|
||||
return nil, nil, false
|
||||
}
|
||||
// Inbound proxy traffic counts as activity: an idle web server reachable
|
||||
// through the proxy should not be auto-paused while it is serving requests.
|
||||
m.mu.Lock()
|
||||
sb.LastActiveAt = time.Now()
|
||||
m.mu.Unlock()
|
||||
return sb.HostIP, sb.connTracker, true
|
||||
}
|
||||
|
||||
@ -872,6 +916,146 @@ func (m *Manager) reapExpired(_ context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// StartActivitySampler starts a background goroutine that polls each running
|
||||
// sandbox's guest liveness (CPU + net/disk IO) and refreshes LastActiveAt when
|
||||
// the sandbox is doing real work. This is what keeps a long-running but
|
||||
// non-interactive job (a build, a download) from being auto-paused by the TTL
|
||||
// reaper, while an idle workload (sleep, a parked shell) still times out.
|
||||
func (m *Manager) StartActivitySampler(ctx context.Context) {
|
||||
interval := m.cfg.ActivitySampleInterval
|
||||
if interval <= 0 {
|
||||
interval = defaultActivitySampleInterval
|
||||
}
|
||||
|
||||
go func() {
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-m.stopCh:
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.sampleActivity(ctx)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// activityTarget pairs a sandbox ID with the envd client to poll.
|
||||
type activityTarget struct {
|
||||
id string
|
||||
client *envdclient.Client
|
||||
}
|
||||
|
||||
func (m *Manager) sampleActivity(ctx context.Context) {
|
||||
// Snapshot the running sandboxes and their clients under the lock, then
|
||||
// poll over the network without holding it.
|
||||
m.mu.RLock()
|
||||
targets := make([]activityTarget, 0, len(m.boxes))
|
||||
for id, sb := range m.boxes {
|
||||
if sb.Status != models.StatusRunning {
|
||||
continue
|
||||
}
|
||||
// Skip sandboxes still loading memory after a resume — they are not
|
||||
// settled and their IO/CPU is preload noise, not user work.
|
||||
if sb.memLoadDone != nil {
|
||||
select {
|
||||
case <-sb.memLoadDone:
|
||||
default:
|
||||
continue
|
||||
}
|
||||
}
|
||||
c := sb.client.Load()
|
||||
if c == nil {
|
||||
continue
|
||||
}
|
||||
targets = append(targets, activityTarget{id: id, client: c})
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
if len(targets) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
sem := make(chan struct{}, activitySampleConcurrency)
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range targets {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(t activityTarget) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
m.pollAndBump(ctx, t)
|
||||
}(t)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
// pollAndBump fetches one sandbox's activity and refreshes its TTL once it has
|
||||
// been busy for busyDebounceSamples consecutive samples. Poll failures are
|
||||
// treated as a non-busy sample: an unreachable envd is handled by the reaper /
|
||||
// heartbeat paths, and resetting the streak is the safe default.
|
||||
func (m *Manager) pollAndBump(ctx context.Context, t activityTarget) {
|
||||
pollCtx, cancel := context.WithTimeout(ctx, activityPollTimeout)
|
||||
defer cancel()
|
||||
|
||||
act, err := t.client.FetchActivity(pollCtx)
|
||||
busy := err == nil && m.isBusy(act)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
sb, ok := m.boxes[t.id]
|
||||
if !ok || sb.Status != models.StatusRunning {
|
||||
return
|
||||
}
|
||||
|
||||
streak, bump := applyBusySample(sb.activityBusyStreak, busy)
|
||||
sb.activityBusyStreak = streak
|
||||
if bump {
|
||||
sb.LastActiveAt = time.Now()
|
||||
}
|
||||
}
|
||||
|
||||
// applyBusySample advances a debounce streak with the latest sample and
|
||||
// reports whether the TTL should be refreshed this tick. A non-busy sample
|
||||
// resets the streak; the bump fires once the streak reaches the debounce
|
||||
// threshold and on every busy tick thereafter (the streak is held at the
|
||||
// threshold rather than growing unbounded).
|
||||
func applyBusySample(streak int, busy bool) (newStreak int, bump bool) {
|
||||
if !busy {
|
||||
return 0, false
|
||||
}
|
||||
streak++
|
||||
if streak >= busyDebounceSamples {
|
||||
return busyDebounceSamples, true
|
||||
}
|
||||
return streak, false
|
||||
}
|
||||
|
||||
// isBusy reports whether a guest liveness snapshot represents real work.
|
||||
func (m *Manager) isBusy(act *envdclient.Activity) bool {
|
||||
cpuThreshold := m.cfg.CPUBusyPct
|
||||
if cpuThreshold <= 0 {
|
||||
cpuThreshold = defaultCPUBusyPct
|
||||
}
|
||||
netFloor := m.cfg.NetFloorBps
|
||||
if netFloor == 0 {
|
||||
netFloor = defaultNetFloorBps
|
||||
}
|
||||
diskFloor := m.cfg.DiskFloorBps
|
||||
if diskFloor == 0 {
|
||||
diskFloor = defaultDiskFloorBps
|
||||
}
|
||||
|
||||
return act.CPUUsedPct >= cpuThreshold ||
|
||||
act.NetBps >= netFloor ||
|
||||
act.DiskBps >= diskFloor
|
||||
}
|
||||
|
||||
// Shutdown gracefully drains the manager. Running sandboxes are paused so
|
||||
// their state survives across agent restarts; any sandboxes still holding
|
||||
// runtime resources after PauseAll (e.g. paused failed, or status was
|
||||
|
||||
@ -110,7 +110,7 @@ func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState
|
||||
slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
|
||||
return
|
||||
}
|
||||
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr); err != nil {
|
||||
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr, m.cfg.ProxyDomain); err != nil {
|
||||
slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user