1
0
forked from wrenn/wrenn

feat: async sandbox lifecycle with Redis Stream events

Replace synchronous RPC-based CP-host communication for sandbox
lifecycle operations (Create, Pause, Resume, Destroy) with an async
pattern. CP handlers now return 202 Accepted immediately, fire agent
RPCs in background goroutines, and publish state events to a Redis
Stream. A background consumer processes events as a fallback writer.

Agent-side auto-pause events are pushed to the CP via HTTP callback
(POST /v1/hosts/sandbox-events), keeping Redis internal to the CP.

All DB status transitions use conditional updates
(UpdateSandboxStatusIf, UpdateSandboxRunningIf) to prevent race
conditions between concurrent operations and background goroutines.

The HostMonitor reconciler is kept at 60s as a safety net, extended
to handle transient statuses (starting, pausing, resuming, stopping).

Frontend updated to handle 202 responses with empty bodies and render
transient statuses with blue indicators.
This commit is contained in:
2026-05-15 12:25:16 +06:00
parent c08884fa2c
commit 6faad45a28
18 changed files with 944 additions and 128 deletions

View File

@ -187,8 +187,13 @@ func Run(opts ...Option) {
// Start channel event dispatcher.
channelDispatcher.Start(ctx)
// Start host monitor (passive + active reconciliation every 30s).
monitor := api.NewHostMonitor(queries, hostPool, al, 15*time.Second)
// Start sandbox event consumer (processes lifecycle events from Redis stream).
sandboxEventConsumer := api.NewSandboxEventConsumer(rdb, queries, al)
sandboxEventConsumer.Start(ctx)
// Start host monitor (passive + active reconciliation every 60s).
// Reduced from 15s since async events handle the normal case.
monitor := api.NewHostMonitor(queries, hostPool, al, 60*time.Second)
monitor.Start(ctx)
// Hard-delete accounts that have been soft-deleted for more than 15 days (runs every 24h).

View File

@ -375,7 +375,7 @@ const markSandboxesMissingByHost = `-- name: MarkSandboxesMissingByHost :exec
UPDATE sandboxes
SET status = 'missing',
last_updated = NOW()
WHERE host_id = $1 AND status IN ('running', 'starting', 'pending')
WHERE host_id = $1 AND status IN ('running', 'starting', 'pending', 'pausing', 'resuming', 'stopping')
`
// Called when the host monitor marks a host unreachable.
@ -470,6 +470,61 @@ func (q *Queries) UpdateSandboxRunning(ctx context.Context, arg UpdateSandboxRun
return i, err
}
const updateSandboxRunningIf = `-- name: UpdateSandboxRunningIf :one
UPDATE sandboxes
SET status = 'running',
host_ip = $3,
guest_ip = $4,
started_at = $5,
last_active_at = $5,
last_updated = NOW()
WHERE id = $1 AND status = $2
RETURNING id, team_id, host_id, template, status, vcpus, memory_mb, timeout_sec, disk_size_mb, guest_ip, host_ip, created_at, started_at, last_active_at, last_updated, template_id, template_team_id, metadata
`
type UpdateSandboxRunningIfParams struct {
ID pgtype.UUID `json:"id"`
Status string `json:"status"`
HostIp string `json:"host_ip"`
GuestIp string `json:"guest_ip"`
StartedAt pgtype.Timestamptz `json:"started_at"`
}
// Conditionally transition a sandbox to running only if the current status
// matches the expected value. Prevents races where a user destroys a sandbox
// while the create/resume goroutine is still in-flight.
func (q *Queries) UpdateSandboxRunningIf(ctx context.Context, arg UpdateSandboxRunningIfParams) (Sandbox, error) {
row := q.db.QueryRow(ctx, updateSandboxRunningIf,
arg.ID,
arg.Status,
arg.HostIp,
arg.GuestIp,
arg.StartedAt,
)
var i Sandbox
err := row.Scan(
&i.ID,
&i.TeamID,
&i.HostID,
&i.Template,
&i.Status,
&i.Vcpus,
&i.MemoryMb,
&i.TimeoutSec,
&i.DiskSizeMb,
&i.GuestIp,
&i.HostIp,
&i.CreatedAt,
&i.StartedAt,
&i.LastActiveAt,
&i.LastUpdated,
&i.TemplateID,
&i.TemplateTeamID,
&i.Metadata,
)
return i, err
}
const updateSandboxStatus = `-- name: UpdateSandboxStatus :one
UPDATE sandboxes
SET status = $2,
@ -508,3 +563,46 @@ func (q *Queries) UpdateSandboxStatus(ctx context.Context, arg UpdateSandboxStat
)
return i, err
}
const updateSandboxStatusIf = `-- name: UpdateSandboxStatusIf :one
UPDATE sandboxes
SET status = $3,
last_updated = NOW()
WHERE id = $1 AND status = $2
RETURNING id, team_id, host_id, template, status, vcpus, memory_mb, timeout_sec, disk_size_mb, guest_ip, host_ip, created_at, started_at, last_active_at, last_updated, template_id, template_team_id, metadata
`
type UpdateSandboxStatusIfParams struct {
ID pgtype.UUID `json:"id"`
Status string `json:"status"`
Status_2 string `json:"status_2"`
}
// Atomically update status only when the current status matches the expected value.
// Prevents background goroutines from overwriting a status that has since changed
// (e.g. user destroyed a sandbox while Create was in-flight).
func (q *Queries) UpdateSandboxStatusIf(ctx context.Context, arg UpdateSandboxStatusIfParams) (Sandbox, error) {
row := q.db.QueryRow(ctx, updateSandboxStatusIf, arg.ID, arg.Status, arg.Status_2)
var i Sandbox
err := row.Scan(
&i.ID,
&i.TeamID,
&i.HostID,
&i.Template,
&i.Status,
&i.Vcpus,
&i.MemoryMb,
&i.TimeoutSec,
&i.DiskSizeMb,
&i.GuestIp,
&i.HostIp,
&i.CreatedAt,
&i.StartedAt,
&i.LastActiveAt,
&i.LastUpdated,
&i.TemplateID,
&i.TemplateTeamID,
&i.Metadata,
)
return i, err
}

View File

@ -18,12 +18,27 @@ import (
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
)
// SandboxEventPublisher writes sandbox lifecycle events to the Redis stream.
type SandboxEventPublisher func(ctx context.Context, event SandboxStateEvent)
// SandboxStateEvent is the event payload published to the Redis stream.
type SandboxStateEvent struct {
Event string `json:"event"`
SandboxID string `json:"sandbox_id"`
HostID string `json:"host_id"`
HostIP string `json:"host_ip,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
Error string `json:"error,omitempty"`
Timestamp int64 `json:"timestamp"`
}
// SandboxService provides sandbox lifecycle operations shared between the
// REST API and the dashboard.
type SandboxService struct {
DB *db.Queries
Pool *lifecycle.HostClientPool
Scheduler scheduler.HostScheduler
DB *db.Queries
Pool *lifecycle.HostClientPool
Scheduler scheduler.HostScheduler
PublishEvent SandboxEventPublisher
}
// SandboxCreateParams holds the parameters for creating a sandbox.
@ -53,6 +68,12 @@ func (s *SandboxService) agentForSandbox(ctx context.Context, sandboxID pgtype.U
return agent, sb, nil
}
func (s *SandboxService) publishEvent(ctx context.Context, event SandboxStateEvent) {
if s.PublishEvent != nil {
s.PublishEvent(ctx, event)
}
}
// hostagentClient is a local alias to avoid the full package path in signatures.
type hostagentClient = interface {
CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
@ -64,8 +85,10 @@ type hostagentClient = interface {
FlushSandboxMetrics(ctx context.Context, req *connect.Request[pb.FlushSandboxMetricsRequest]) (*connect.Response[pb.FlushSandboxMetricsResponse], error)
}
// Create creates a new sandbox: picks a host via the scheduler, inserts a pending
// DB record, calls the host agent, and updates the record to running.
// Create creates a new sandbox asynchronously: picks a host, inserts a
// "starting" DB record, fires the agent RPC in a background goroutine, and
// returns the sandbox immediately. The background goroutine publishes a
// sandbox event to the Redis stream when the operation completes.
func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.Sandbox, error) {
if p.Template == "" {
p.Template = "minimal"
@ -96,11 +119,9 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
templateTeamID = tmpl.TeamID
templateID = tmpl.ID
templateDefaultUser = tmpl.DefaultUser
// Parse default_env JSONB into a map.
if len(tmpl.DefaultEnv) > 0 {
_ = json.Unmarshal(tmpl.DefaultEnv, &templateDefaultEnv)
}
// If the template is a snapshot, use its baked-in vcpus/memory.
if tmpl.Type == "snapshot" {
p.VCPUs = tmpl.Vcpus
p.MemoryMB = tmpl.MemoryMb
@ -111,13 +132,11 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
return db.Sandbox{}, fmt.Errorf("invalid request: team_id is required")
}
// Determine whether this team uses BYOC hosts or platform hosts.
team, err := s.DB.GetTeam(ctx, p.TeamID)
if err != nil {
return db.Sandbox{}, fmt.Errorf("team not found: %w", err)
}
// Pick a host for this sandbox.
host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc, p.MemoryMB, p.DiskSizeMB)
if err != nil {
return db.Sandbox{}, fmt.Errorf("select host: %w", err)
@ -130,13 +149,14 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
sandboxID := id.NewSandboxID()
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(host.ID)
if _, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
sb, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
ID: sandboxID,
TeamID: p.TeamID,
HostID: host.ID,
Template: p.Template,
Status: "pending",
Status: "starting",
Vcpus: p.VCPUs,
MemoryMb: p.MemoryMB,
TimeoutSec: p.TimeoutSec,
@ -144,11 +164,26 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
TemplateID: templateID,
TemplateTeamID: templateTeamID,
Metadata: []byte("{}"),
}); err != nil {
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("insert sandbox: %w", err)
}
resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
go s.createInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, p, templateTeamID, templateID, templateDefaultUser, templateDefaultEnv)
return sb, nil
}
func (s *SandboxService) createInBackground(
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string,
agent hostagentClient, p SandboxCreateParams,
templateTeamID, templateID pgtype.UUID,
defaultUser string, defaultEnv map[string]string,
) {
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
resp, err := agent.CreateSandbox(bgCtx, connect.NewRequest(&pb.CreateSandboxRequest{
SandboxId: sandboxIDStr,
Template: p.Template,
TeamId: id.UUIDString(templateTeamID),
@ -157,45 +192,52 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
MemoryMb: p.MemoryMB,
TimeoutSec: p.TimeoutSec,
DiskSizeMb: p.DiskSizeMB,
DefaultUser: templateDefaultUser,
DefaultEnv: templateDefaultEnv,
DefaultUser: defaultUser,
DefaultEnv: defaultEnv,
}))
if err != nil {
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "error",
slog.Warn("background create failed", "sandbox_id", sandboxIDStr, "error", err)
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer errCancel()
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "starting", Status_2: "error",
}); dbErr != nil {
slog.Warn("failed to update sandbox status to error", "id", sandboxIDStr, "error", dbErr)
slog.Warn("failed to update sandbox to error after create failure", "id", sandboxIDStr, "error", dbErr)
}
return db.Sandbox{}, fmt.Errorf("agent create: %w", err)
s.publishEvent(errCtx, SandboxStateEvent{
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
Error: err.Error(), Timestamp: time.Now().Unix(),
})
return
}
now := time.Now()
sb, err := s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
ID: sandboxID,
Status: "starting",
HostIp: resp.Msg.HostIp,
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("update sandbox running: %w", err)
}); dbErr != nil {
slog.Warn("failed to update sandbox running after create", "id", sandboxIDStr, "error", dbErr)
}
// Store runtime metadata from the agent (envd/kernel/firecracker/agent versions).
if meta := resp.Msg.Metadata; len(meta) > 0 {
metaJSON, _ := json.Marshal(meta)
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
ID: sandboxID,
Metadata: metaJSON,
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
ID: sandboxID, Metadata: metaJSON,
}); err != nil {
slog.Warn("failed to store sandbox metadata", "id", sandboxIDStr, "error", err)
}
sb.Metadata = metaJSON
}
return sb, nil
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.started", SandboxID: sandboxIDStr, HostID: hostIDStr,
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
Timestamp: now.Unix(),
})
}
// List returns active sandboxes (excludes stopped/error) belonging to the given team.
@ -208,7 +250,9 @@ func (s *SandboxService) Get(ctx context.Context, sandboxID, teamID pgtype.UUID)
return s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
}
// Pause snapshots and freezes a running sandbox to disk.
// Pause snapshots and freezes a running sandbox to disk asynchronously.
// Pre-marks the DB status as "pausing" and fires the agent RPC in a
// background goroutine.
func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
@ -224,25 +268,29 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
// Pre-mark as "paused" in DB before the RPC so the reconciler does not
// mark the sandbox "stopped" while the host agent processes the pause.
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "paused",
}); err != nil {
return db.Sandbox{}, fmt.Errorf("pre-mark paused: %w", err)
sb, err = s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "running", Status_2: "pausing",
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("sandbox status changed concurrently")
}
// Flush all metrics tiers before pausing so data survives in DB.
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
go s.pauseInBackground(sandboxID, sandboxIDStr, hostIDStr, agent)
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
return sb, nil
}
func (s *SandboxService) pauseInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string, agent hostagentClient) {
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, true)
if _, err := agent.PauseSandbox(bgCtx, connect.NewRequest(&pb.PauseSandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil {
// Check if the agent still has this sandbox. If it was destroyed
// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
// reverting to "running" — which would create a ghost record.
// Use a fresh context since the original ctx may already be expired.
revertStatus := "running"
pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
@ -253,23 +301,37 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
}
pingCancel()
dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: revertStatus,
if _, dbErr := s.DB.UpdateSandboxStatusIf(dbCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "pausing", Status_2: revertStatus,
}); dbErr != nil {
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
}
dbCancel()
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
evtCtx, evtCancel := context.WithTimeout(context.Background(), 5*time.Second)
s.publishEvent(evtCtx, SandboxStateEvent{
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
Error: err.Error(), Timestamp: time.Now().Unix(),
})
evtCancel()
return
}
sb, err = s.DB.GetSandbox(ctx, sandboxID)
if err != nil {
return db.Sandbox{}, fmt.Errorf("get sandbox after pause: %w", err)
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "pausing", Status_2: "paused",
}); err != nil {
slog.Warn("failed to update sandbox to paused", "sandbox_id", sandboxIDStr, "error", err)
}
return sb, nil
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.paused", SandboxID: sandboxIDStr, HostID: hostIDStr,
Timestamp: time.Now().Unix(),
})
}
// Resume restores a paused sandbox from snapshot.
// Resume restores a paused sandbox from snapshot asynchronously.
// Pre-marks the DB status as "resuming" and fires the agent RPC in a
// background goroutine.
func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
@ -285,8 +347,8 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
// Look up template defaults for resume.
var resumeDefaultUser string
var resumeDefaultEnv map[string]string
if sb.TemplateID.Valid {
@ -299,7 +361,6 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
}
}
// Extract kernel version hint from existing sandbox metadata.
var kernelVersion string
if len(sb.Metadata) > 0 {
var meta map[string]string
@ -308,52 +369,88 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
}
}
resp, err := agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxIDStr,
TimeoutSec: sb.TimeoutSec,
DefaultUser: resumeDefaultUser,
DefaultEnv: resumeDefaultEnv,
KernelVersion: kernelVersion,
}))
if err != nil {
return db.Sandbox{}, fmt.Errorf("agent resume: %w", err)
}
now := time.Now()
sb, err = s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
sb, err = s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "paused", Status_2: "resuming",
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("update status: %w", err)
return db.Sandbox{}, fmt.Errorf("sandbox status changed concurrently")
}
// Update metadata with actual versions used after resume.
if meta := resp.Msg.Metadata; len(meta) > 0 {
metaJSON, _ := json.Marshal(meta)
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
ID: sandboxID,
Metadata: metaJSON,
}); err != nil {
slog.Warn("failed to update sandbox metadata after resume", "id", sandboxIDStr, "error", err)
}
sb.Metadata = metaJSON
}
go s.resumeInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, sb.TimeoutSec, resumeDefaultUser, resumeDefaultEnv, kernelVersion)
return sb, nil
}
// Destroy stops a sandbox and marks it as stopped.
func (s *SandboxService) resumeInBackground(
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string,
agent hostagentClient, timeoutSec int32,
defaultUser string, defaultEnv map[string]string, kernelVersion string,
) {
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
resp, err := agent.ResumeSandbox(bgCtx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxIDStr,
TimeoutSec: timeoutSec,
DefaultUser: defaultUser,
DefaultEnv: defaultEnv,
KernelVersion: kernelVersion,
}))
if err != nil {
slog.Warn("background resume failed", "sandbox_id", sandboxIDStr, "error", err)
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer errCancel()
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "resuming", Status_2: "paused",
}); dbErr != nil {
slog.Warn("failed to revert sandbox to paused after resume failure", "id", sandboxIDStr, "error", dbErr)
}
s.publishEvent(errCtx, SandboxStateEvent{
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
Error: err.Error(), Timestamp: time.Now().Unix(),
})
return
}
now := time.Now()
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
ID: sandboxID,
Status: "resuming",
HostIp: resp.Msg.HostIp,
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
}); dbErr != nil {
slog.Warn("failed to update sandbox running after resume", "id", sandboxIDStr, "error", dbErr)
}
if meta := resp.Msg.Metadata; len(meta) > 0 {
metaJSON, _ := json.Marshal(meta)
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
ID: sandboxID, Metadata: metaJSON,
}); err != nil {
slog.Warn("failed to update sandbox metadata after resume", "id", sandboxIDStr, "error", err)
}
}
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.resumed", SandboxID: sandboxIDStr, HostID: hostIDStr,
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
Timestamp: now.Unix(),
})
}
// Destroy stops a sandbox asynchronously. Pre-marks the DB status as
// "stopping" and fires the agent RPC in a background goroutine.
func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
return fmt.Errorf("sandbox not found: %w", err)
}
if sb.Status == "stopped" || sb.Status == "error" {
return nil
}
agent, _, err := s.agentForSandbox(ctx, sandboxID)
if err != nil {
@ -361,35 +458,53 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.U
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
prevStatus := sb.Status
// If running, flush 24h tier metrics for analytics before destroying.
if sb.Status == "running" {
s.flushAndPersistMetrics(ctx, agent, sandboxID, false)
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "stopping",
}); err != nil {
return fmt.Errorf("pre-mark stopping: %w", err)
}
// Destroy on host agent. A not-found response is fine — sandbox is already gone.
if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
go s.destroyInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, prevStatus)
return nil
}
func (s *SandboxService) destroyInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string, agent hostagentClient, prevStatus string) {
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
if prevStatus == "running" || prevStatus == "pausing" {
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, false)
}
if _, err := agent.DestroySandbox(bgCtx, connect.NewRequest(&pb.DestroySandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
return fmt.Errorf("agent destroy: %w", err)
slog.Warn("background destroy failed", "sandbox_id", sandboxIDStr, "error", err)
}
// For a paused sandbox, only keep 24h tier; remove the finer-grained tiers.
if sb.Status == "paused" {
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
if prevStatus == "paused" {
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
SandboxID: sandboxID, Tier: "10m",
})
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
SandboxID: sandboxID, Tier: "2h",
})
}
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "stopped",
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "stopping", Status_2: "stopped",
}); err != nil {
return fmt.Errorf("update status: %w", err)
slog.Warn("failed to update sandbox to stopped", "sandbox_id", sandboxIDStr, "error", err)
}
return nil
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.stopped", SandboxID: sandboxIDStr, HostID: hostIDStr,
Timestamp: time.Now().Unix(),
})
}
// flushAndPersistMetrics calls FlushSandboxMetrics on the agent and stores