forked from wrenn/wrenn
feat: async sandbox lifecycle with Redis Stream events
Replace synchronous RPC-based CP-host communication for sandbox lifecycle operations (Create, Pause, Resume, Destroy) with an async pattern. CP handlers now return 202 Accepted immediately, fire agent RPCs in background goroutines, and publish state events to a Redis Stream. A background consumer processes events as a fallback writer. Agent-side auto-pause events are pushed to the CP via HTTP callback (POST /v1/hosts/sandbox-events), keeping Redis internal to the CP. All DB status transitions use conditional updates (UpdateSandboxStatusIf, UpdateSandboxRunningIf) to prevent race conditions between concurrent operations and background goroutines. The HostMonitor reconciler is kept at 60s as a safety net, extended to handle transient statuses (starting, pausing, resuming, stopping). Frontend updated to handle 202 responses with empty bodies and render transient statuses with blue indicators.
This commit is contained in:
@@ -18,12 +18,27 @@ import (
|
||||
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
|
||||
)
|
||||
|
||||
// SandboxEventPublisher writes sandbox lifecycle events to the Redis stream.
// The function returns nothing, so a publish failure cannot be reported back
// to the caller through this type.
|
||||
type SandboxEventPublisher func(ctx context.Context, event SandboxStateEvent)
|
||||
|
||||
// SandboxStateEvent is the event payload published to the Redis stream.
// It is serialised as JSON using the field tags below; HostIP, Metadata and
// Error are omitted from the encoded form when empty.
|
||||
type SandboxStateEvent struct {
|
||||
// Event names the lifecycle transition. Values emitted in this file are
// "sandbox.started", "sandbox.paused", "sandbox.resumed", "sandbox.stopped"
// and "sandbox.failed".
Event string `json:"event"`
|
||||
// SandboxID is the sandbox ID in its formatted string form (id.FormatSandboxID).
SandboxID string `json:"sandbox_id"`
|
||||
// HostID is the host ID in its formatted string form (id.FormatHostID).
HostID string `json:"host_id"`
|
||||
// HostIP is the host IP returned by the agent; set on started/resumed events.
HostIP string `json:"host_ip,omitempty"`
|
||||
// Metadata is the metadata map returned by the agent; set on started/resumed events.
Metadata map[string]string `json:"metadata,omitempty"`
|
||||
// Error carries err.Error() of the failed agent call on "sandbox.failed" events.
Error string `json:"error,omitempty"`
|
||||
// Timestamp is the event time in Unix seconds (time.Time.Unix).
Timestamp int64 `json:"timestamp"`
|
||||
}
|
||||
|
||||
// SandboxService provides sandbox lifecycle operations shared between the
|
||||
// REST API and the dashboard.
//
// NOTE(review): DB, Pool and Scheduler are listed twice below. This is
// presumably the removed and the re-aligned added lines of the diff shown
// together; confirm against the real file, because a Go struct with duplicate
// field names does not compile.
|
||||
type SandboxService struct {
|
||||
DB *db.Queries
|
||||
Pool *lifecycle.HostClientPool
|
||||
Scheduler scheduler.HostScheduler
|
||||
DB *db.Queries
|
||||
Pool *lifecycle.HostClientPool
|
||||
Scheduler scheduler.HostScheduler
|
||||
// PublishEvent is optional: the publishEvent method checks it for nil and
// skips publishing when it is unset.
PublishEvent SandboxEventPublisher
|
||||
}
|
||||
|
||||
// SandboxCreateParams holds the parameters for creating a sandbox.
|
||||
@@ -53,6 +68,12 @@ func (s *SandboxService) agentForSandbox(ctx context.Context, sandboxID pgtype.U
|
||||
return agent, sb, nil
|
||||
}
|
||||
|
||||
// publishEvent forwards event to s.PublishEvent using ctx. When no publisher
// is configured (s.PublishEvent is nil) the event is dropped without error.
func (s *SandboxService) publishEvent(ctx context.Context, event SandboxStateEvent) {
|
||||
// A nil publisher is allowed; calling it would panic, so guard first.
if s.PublishEvent != nil {
|
||||
s.PublishEvent(ctx, event)
|
||||
}
|
||||
}
|
||||
|
||||
// hostagentClient is a local alias to avoid the full package path in signatures.
|
||||
type hostagentClient = interface {
|
||||
CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
|
||||
@@ -64,8 +85,10 @@ type hostagentClient = interface {
|
||||
FlushSandboxMetrics(ctx context.Context, req *connect.Request[pb.FlushSandboxMetricsRequest]) (*connect.Response[pb.FlushSandboxMetricsResponse], error)
|
||||
}
|
||||
|
||||
// Create creates a new sandbox: picks a host via the scheduler, inserts a pending
|
||||
// DB record, calls the host agent, and updates the record to running.
|
||||
// Create creates a new sandbox asynchronously: picks a host, inserts a
|
||||
// "starting" DB record, fires the agent RPC in a background goroutine, and
|
||||
// returns the sandbox immediately. The background goroutine publishes a
|
||||
// sandbox event to the Redis stream when the operation completes.
|
||||
func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.Sandbox, error) {
|
||||
if p.Template == "" {
|
||||
p.Template = "minimal"
|
||||
@@ -96,11 +119,9 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
||||
templateTeamID = tmpl.TeamID
|
||||
templateID = tmpl.ID
|
||||
templateDefaultUser = tmpl.DefaultUser
|
||||
// Parse default_env JSONB into a map.
|
||||
if len(tmpl.DefaultEnv) > 0 {
|
||||
_ = json.Unmarshal(tmpl.DefaultEnv, &templateDefaultEnv)
|
||||
}
|
||||
// If the template is a snapshot, use its baked-in vcpus/memory.
|
||||
if tmpl.Type == "snapshot" {
|
||||
p.VCPUs = tmpl.Vcpus
|
||||
p.MemoryMB = tmpl.MemoryMb
|
||||
@@ -111,13 +132,11 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
||||
return db.Sandbox{}, fmt.Errorf("invalid request: team_id is required")
|
||||
}
|
||||
|
||||
// Determine whether this team uses BYOC hosts or platform hosts.
|
||||
team, err := s.DB.GetTeam(ctx, p.TeamID)
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("team not found: %w", err)
|
||||
}
|
||||
|
||||
// Pick a host for this sandbox.
|
||||
host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc, p.MemoryMB, p.DiskSizeMB)
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("select host: %w", err)
|
||||
@@ -130,13 +149,14 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
||||
|
||||
sandboxID := id.NewSandboxID()
|
||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||
hostIDStr := id.FormatHostID(host.ID)
|
||||
|
||||
if _, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
|
||||
sb, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
|
||||
ID: sandboxID,
|
||||
TeamID: p.TeamID,
|
||||
HostID: host.ID,
|
||||
Template: p.Template,
|
||||
Status: "pending",
|
||||
Status: "starting",
|
||||
Vcpus: p.VCPUs,
|
||||
MemoryMb: p.MemoryMB,
|
||||
TimeoutSec: p.TimeoutSec,
|
||||
@@ -144,11 +164,26 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
||||
TemplateID: templateID,
|
||||
TemplateTeamID: templateTeamID,
|
||||
Metadata: []byte("{}"),
|
||||
}); err != nil {
|
||||
})
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("insert sandbox: %w", err)
|
||||
}
|
||||
|
||||
resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
|
||||
go s.createInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, p, templateTeamID, templateID, templateDefaultUser, templateDefaultEnv)
|
||||
|
||||
return sb, nil
|
||||
}
|
||||
|
||||
func (s *SandboxService) createInBackground(
|
||||
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string,
|
||||
agent hostagentClient, p SandboxCreateParams,
|
||||
templateTeamID, templateID pgtype.UUID,
|
||||
defaultUser string, defaultEnv map[string]string,
|
||||
) {
|
||||
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
resp, err := agent.CreateSandbox(bgCtx, connect.NewRequest(&pb.CreateSandboxRequest{
|
||||
SandboxId: sandboxIDStr,
|
||||
Template: p.Template,
|
||||
TeamId: id.UUIDString(templateTeamID),
|
||||
@@ -157,45 +192,52 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
|
||||
MemoryMb: p.MemoryMB,
|
||||
TimeoutSec: p.TimeoutSec,
|
||||
DiskSizeMb: p.DiskSizeMB,
|
||||
DefaultUser: templateDefaultUser,
|
||||
DefaultEnv: templateDefaultEnv,
|
||||
DefaultUser: defaultUser,
|
||||
DefaultEnv: defaultEnv,
|
||||
}))
|
||||
if err != nil {
|
||||
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: "error",
|
||||
slog.Warn("background create failed", "sandbox_id", sandboxIDStr, "error", err)
|
||||
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer errCancel()
|
||||
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "starting", Status_2: "error",
|
||||
}); dbErr != nil {
|
||||
slog.Warn("failed to update sandbox status to error", "id", sandboxIDStr, "error", dbErr)
|
||||
slog.Warn("failed to update sandbox to error after create failure", "id", sandboxIDStr, "error", dbErr)
|
||||
}
|
||||
return db.Sandbox{}, fmt.Errorf("agent create: %w", err)
|
||||
s.publishEvent(errCtx, SandboxStateEvent{
|
||||
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
Error: err.Error(), Timestamp: time.Now().Unix(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
sb, err := s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
|
||||
ID: sandboxID,
|
||||
HostIp: resp.Msg.HostIp,
|
||||
GuestIp: "",
|
||||
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "starting",
|
||||
HostIp: resp.Msg.HostIp,
|
||||
StartedAt: pgtype.Timestamptz{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("update sandbox running: %w", err)
|
||||
}); dbErr != nil {
|
||||
slog.Warn("failed to update sandbox running after create", "id", sandboxIDStr, "error", dbErr)
|
||||
}
|
||||
|
||||
// Store runtime metadata from the agent (envd/kernel/firecracker/agent versions).
|
||||
if meta := resp.Msg.Metadata; len(meta) > 0 {
|
||||
metaJSON, _ := json.Marshal(meta)
|
||||
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
|
||||
ID: sandboxID,
|
||||
Metadata: metaJSON,
|
||||
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
|
||||
ID: sandboxID, Metadata: metaJSON,
|
||||
}); err != nil {
|
||||
slog.Warn("failed to store sandbox metadata", "id", sandboxIDStr, "error", err)
|
||||
}
|
||||
sb.Metadata = metaJSON
|
||||
}
|
||||
|
||||
return sb, nil
|
||||
s.publishEvent(bgCtx, SandboxStateEvent{
|
||||
Event: "sandbox.started", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
|
||||
Timestamp: now.Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// List returns active sandboxes (excludes stopped/error) belonging to the given team.
|
||||
@@ -208,7 +250,9 @@ func (s *SandboxService) Get(ctx context.Context, sandboxID, teamID pgtype.UUID)
|
||||
return s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
||||
}
|
||||
|
||||
// Pause snapshots and freezes a running sandbox to disk.
|
||||
// Pause snapshots and freezes a running sandbox to disk asynchronously.
|
||||
// Pre-marks the DB status as "pausing" and fires the agent RPC in a
|
||||
// background goroutine.
|
||||
func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
|
||||
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
||||
if err != nil {
|
||||
@@ -224,25 +268,29 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
|
||||
}
|
||||
|
||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||
hostIDStr := id.FormatHostID(sb.HostID)
|
||||
|
||||
// Pre-mark as "paused" in DB before the RPC so the reconciler does not
|
||||
// mark the sandbox "stopped" while the host agent processes the pause.
|
||||
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: "paused",
|
||||
}); err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("pre-mark paused: %w", err)
|
||||
sb, err = s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "running", Status_2: "pausing",
|
||||
})
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("sandbox status changed concurrently")
|
||||
}
|
||||
|
||||
// Flush all metrics tiers before pausing so data survives in DB.
|
||||
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
|
||||
go s.pauseInBackground(sandboxID, sandboxIDStr, hostIDStr, agent)
|
||||
|
||||
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
|
||||
return sb, nil
|
||||
}
|
||||
|
||||
func (s *SandboxService) pauseInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string, agent hostagentClient) {
|
||||
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, true)
|
||||
|
||||
if _, err := agent.PauseSandbox(bgCtx, connect.NewRequest(&pb.PauseSandboxRequest{
|
||||
SandboxId: sandboxIDStr,
|
||||
})); err != nil {
|
||||
// Check if the agent still has this sandbox. If it was destroyed
|
||||
// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
|
||||
// reverting to "running" — which would create a ghost record.
|
||||
// Use a fresh context since the original ctx may already be expired.
|
||||
revertStatus := "running"
|
||||
pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
|
||||
@@ -253,23 +301,37 @@ func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUI
|
||||
}
|
||||
pingCancel()
|
||||
dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: revertStatus,
|
||||
if _, dbErr := s.DB.UpdateSandboxStatusIf(dbCtx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "pausing", Status_2: revertStatus,
|
||||
}); dbErr != nil {
|
||||
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
|
||||
}
|
||||
dbCancel()
|
||||
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
|
||||
|
||||
evtCtx, evtCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
s.publishEvent(evtCtx, SandboxStateEvent{
|
||||
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
Error: err.Error(), Timestamp: time.Now().Unix(),
|
||||
})
|
||||
evtCancel()
|
||||
return
|
||||
}
|
||||
|
||||
sb, err = s.DB.GetSandbox(ctx, sandboxID)
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("get sandbox after pause: %w", err)
|
||||
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "pausing", Status_2: "paused",
|
||||
}); err != nil {
|
||||
slog.Warn("failed to update sandbox to paused", "sandbox_id", sandboxIDStr, "error", err)
|
||||
}
|
||||
return sb, nil
|
||||
|
||||
s.publishEvent(bgCtx, SandboxStateEvent{
|
||||
Event: "sandbox.paused", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
Timestamp: time.Now().Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// Resume restores a paused sandbox from snapshot.
|
||||
// Resume restores a paused sandbox from snapshot asynchronously.
|
||||
// Pre-marks the DB status as "resuming" and fires the agent RPC in a
|
||||
// background goroutine.
|
||||
func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
|
||||
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
||||
if err != nil {
|
||||
@@ -285,8 +347,8 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
|
||||
}
|
||||
|
||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||
hostIDStr := id.FormatHostID(sb.HostID)
|
||||
|
||||
// Look up template defaults for resume.
|
||||
var resumeDefaultUser string
|
||||
var resumeDefaultEnv map[string]string
|
||||
if sb.TemplateID.Valid {
|
||||
@@ -299,7 +361,6 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
|
||||
}
|
||||
}
|
||||
|
||||
// Extract kernel version hint from existing sandbox metadata.
|
||||
var kernelVersion string
|
||||
if len(sb.Metadata) > 0 {
|
||||
var meta map[string]string
|
||||
@@ -308,52 +369,88 @@ func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UU
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
||||
SandboxId: sandboxIDStr,
|
||||
TimeoutSec: sb.TimeoutSec,
|
||||
DefaultUser: resumeDefaultUser,
|
||||
DefaultEnv: resumeDefaultEnv,
|
||||
KernelVersion: kernelVersion,
|
||||
}))
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("agent resume: %w", err)
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
sb, err = s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
|
||||
ID: sandboxID,
|
||||
HostIp: resp.Msg.HostIp,
|
||||
GuestIp: "",
|
||||
StartedAt: pgtype.Timestamptz{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
},
|
||||
sb, err = s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "paused", Status_2: "resuming",
|
||||
})
|
||||
if err != nil {
|
||||
return db.Sandbox{}, fmt.Errorf("update status: %w", err)
|
||||
return db.Sandbox{}, fmt.Errorf("sandbox status changed concurrently")
|
||||
}
|
||||
|
||||
// Update metadata with actual versions used after resume.
|
||||
if meta := resp.Msg.Metadata; len(meta) > 0 {
|
||||
metaJSON, _ := json.Marshal(meta)
|
||||
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
|
||||
ID: sandboxID,
|
||||
Metadata: metaJSON,
|
||||
}); err != nil {
|
||||
slog.Warn("failed to update sandbox metadata after resume", "id", sandboxIDStr, "error", err)
|
||||
}
|
||||
sb.Metadata = metaJSON
|
||||
}
|
||||
go s.resumeInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, sb.TimeoutSec, resumeDefaultUser, resumeDefaultEnv, kernelVersion)
|
||||
|
||||
return sb, nil
|
||||
}
|
||||
|
||||
// Destroy stops a sandbox and marks it as stopped.
|
||||
// resumeInBackground performs the agent-side resume for a sandbox that Resume
// has already marked "resuming". It is started with `go` by Resume and reports
// its outcome only through the database, the log and a published event.
//
// On agent failure it logs, attempts the conditional update "resuming" ->
// "paused", publishes "sandbox.failed" and returns. On success it attempts the
// conditional update to running with the agent-reported host IP, stores the
// agent metadata if any, and publishes "sandbox.resumed".
//
// NOTE(review): in the *If params, Status looks like the expected current
// status and Status_2 the new one — confirm against the generated query.
func (s *SandboxService) resumeInBackground(
|
||||
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string,
|
||||
agent hostagentClient, timeoutSec int32,
|
||||
defaultUser string, defaultEnv map[string]string, kernelVersion string,
|
||||
) {
|
||||
// Derived from context.Background rather than a request context, so the
// work is bounded only by this 5-minute timeout.
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
resp, err := agent.ResumeSandbox(bgCtx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
||||
SandboxId: sandboxIDStr,
|
||||
TimeoutSec: timeoutSec,
|
||||
DefaultUser: defaultUser,
|
||||
DefaultEnv: defaultEnv,
|
||||
KernelVersion: kernelVersion,
|
||||
}))
|
||||
if err != nil {
|
||||
slog.Warn("background resume failed", "sandbox_id", sandboxIDStr, "error", err)
|
||||
// Separate 10-second context for the cleanup calls — presumably because
// bgCtx may already have expired when the RPC fails.
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer errCancel()
|
||||
// Conditional revert: only applies while the row is still "resuming".
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "resuming", Status_2: "paused",
|
||||
}); dbErr != nil {
|
||||
slog.Warn("failed to revert sandbox to paused after resume failure", "id", sandboxIDStr, "error", dbErr)
|
||||
}
|
||||
// The failure event is published even if the revert above failed.
s.publishEvent(errCtx, SandboxStateEvent{
|
||||
Event: "sandbox.failed", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
Error: err.Error(), Timestamp: time.Now().Unix(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// One timestamp is used for both StartedAt and the event's Timestamp.
now := time.Now()
|
||||
// A failure here is only logged; the metadata write and the
// "sandbox.resumed" event below still happen.
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "resuming",
|
||||
HostIp: resp.Msg.HostIp,
|
||||
StartedAt: pgtype.Timestamptz{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
},
|
||||
}); dbErr != nil {
|
||||
slog.Warn("failed to update sandbox running after resume", "id", sandboxIDStr, "error", dbErr)
|
||||
}
|
||||
|
||||
// Persist the agent-reported metadata only when the agent returned some.
if meta := resp.Msg.Metadata; len(meta) > 0 {
|
||||
// The marshal error is discarded here.
metaJSON, _ := json.Marshal(meta)
|
||||
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
|
||||
ID: sandboxID, Metadata: metaJSON,
|
||||
}); err != nil {
|
||||
slog.Warn("failed to update sandbox metadata after resume", "id", sandboxIDStr, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
s.publishEvent(bgCtx, SandboxStateEvent{
|
||||
Event: "sandbox.resumed", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
|
||||
Timestamp: now.Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// Destroy stops a sandbox asynchronously. Pre-marks the DB status as
|
||||
// "stopping" and fires the agent RPC in a background goroutine.
|
||||
func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
|
||||
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
||||
if err != nil {
|
||||
return fmt.Errorf("sandbox not found: %w", err)
|
||||
}
|
||||
if sb.Status == "stopped" || sb.Status == "error" {
|
||||
return nil
|
||||
}
|
||||
|
||||
agent, _, err := s.agentForSandbox(ctx, sandboxID)
|
||||
if err != nil {
|
||||
@@ -361,35 +458,53 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.U
|
||||
}
|
||||
|
||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||
hostIDStr := id.FormatHostID(sb.HostID)
|
||||
prevStatus := sb.Status
|
||||
|
||||
// If running, flush 24h tier metrics for analytics before destroying.
|
||||
if sb.Status == "running" {
|
||||
s.flushAndPersistMetrics(ctx, agent, sandboxID, false)
|
||||
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: "stopping",
|
||||
}); err != nil {
|
||||
return fmt.Errorf("pre-mark stopping: %w", err)
|
||||
}
|
||||
|
||||
// Destroy on host agent. A not-found response is fine — sandbox is already gone.
|
||||
if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
|
||||
go s.destroyInBackground(sandboxID, sandboxIDStr, hostIDStr, agent, prevStatus)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SandboxService) destroyInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr string, agent hostagentClient, prevStatus string) {
|
||||
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
if prevStatus == "running" || prevStatus == "pausing" {
|
||||
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, false)
|
||||
}
|
||||
|
||||
if _, err := agent.DestroySandbox(bgCtx, connect.NewRequest(&pb.DestroySandboxRequest{
|
||||
SandboxId: sandboxIDStr,
|
||||
})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
|
||||
return fmt.Errorf("agent destroy: %w", err)
|
||||
slog.Warn("background destroy failed", "sandbox_id", sandboxIDStr, "error", err)
|
||||
}
|
||||
|
||||
// For a paused sandbox, only keep 24h tier; remove the finer-grained tiers.
|
||||
if sb.Status == "paused" {
|
||||
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
|
||||
if prevStatus == "paused" {
|
||||
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
|
||||
SandboxID: sandboxID, Tier: "10m",
|
||||
})
|
||||
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
|
||||
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
|
||||
SandboxID: sandboxID, Tier: "2h",
|
||||
})
|
||||
}
|
||||
|
||||
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: "stopped",
|
||||
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "stopping", Status_2: "stopped",
|
||||
}); err != nil {
|
||||
return fmt.Errorf("update status: %w", err)
|
||||
slog.Warn("failed to update sandbox to stopped", "sandbox_id", sandboxIDStr, "error", err)
|
||||
}
|
||||
return nil
|
||||
|
||||
s.publishEvent(bgCtx, SandboxStateEvent{
|
||||
Event: "sandbox.stopped", SandboxID: sandboxIDStr, HostID: hostIDStr,
|
||||
Timestamp: time.Now().Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// flushAndPersistMetrics calls FlushSandboxMetrics on the agent and stores
|
||||
|
||||
Reference in New Issue
Block a user