forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
774 lines
29 KiB
Go
774 lines
29 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"connectrpc.com/connect"
|
|
"github.com/jackc/pgx/v5/pgtype"
|
|
|
|
"git.omukk.dev/wrenn/wrenn/pkg/db"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/id"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/lifecycle"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/scheduler"
|
|
"git.omukk.dev/wrenn/wrenn/pkg/validate"
|
|
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
|
|
)
|
|
|
|
// SandboxEventPublisher writes sandbox lifecycle events to the Redis stream.
|
|
type SandboxEventPublisher func(ctx context.Context, event SandboxStateEvent)
|
|
|
|
// SandboxStateEvent is the payload handed to a SandboxEventPublisher. It is
// serialised with the JSON keys given in the field tags; fields tagged
// omitempty are left out of the encoding when empty.
type SandboxStateEvent struct {
	// Event is the event name, e.g. "sandbox.started" or "sandbox.failed".
	Event string `json:"event"`
	// SandboxID is the formatted identifier of the sandbox the event is about.
	SandboxID string `json:"sandbox_id"`
	// TeamID is the formatted identifier of the owning team.
	TeamID string `json:"team_id,omitempty"`
	// HostID is the formatted identifier of the host running the sandbox.
	HostID string `json:"host_id"`
	// HostIP is the address reported by the host agent, when one is known.
	HostIP string `json:"host_ip,omitempty"`
	// Metadata carries event-specific key/value details.
	Metadata map[string]string `json:"metadata,omitempty"`
	// Error holds the failure message for *_failed events.
	Error string `json:"error,omitempty"`
	// Timestamp is the event time in Unix seconds.
	Timestamp int64 `json:"timestamp"`
}
|
|
|
|
// SandboxService provides sandbox lifecycle operations shared between the
|
|
// REST API and the dashboard.
|
|
type SandboxService struct {
|
|
DB *db.Queries
|
|
Pool *lifecycle.HostClientPool
|
|
Scheduler scheduler.HostScheduler
|
|
PublishEvent SandboxEventPublisher
|
|
}
|
|
|
|
// SandboxCreateParams holds the parameters for creating a sandbox.
|
|
type SandboxCreateParams struct {
|
|
TeamID pgtype.UUID
|
|
Template string
|
|
VCPUs int32
|
|
MemoryMB int32
|
|
TimeoutSec int32
|
|
}
|
|
|
|
// MinTimeoutSec is the smallest non-zero TTL the control plane will record,
// mirroring internal/sandbox.MinTimeoutSec. Sub-minute TTLs race the
// post-create startup window (DB insert → /init → memory loader). The agent
// clamps silently on its own, so the control plane clamps as well to keep the
// DB record in agreement with what the agent actually runs. A TTL of 0 is
// left alone and means "no TTL".
const MinTimeoutSec int32 = 60

// clampTimeout normalises a caller-supplied TTL (in seconds) the same way the
// host agent does; keep it in sync with internal/sandbox.clampTimeout.
//
//   - zero or negative input yields 0 (no TTL)
//   - a positive input below MinTimeoutSec is raised to MinTimeoutSec
//   - anything else is returned unchanged
func clampTimeout(timeoutSec int32) int32 {
	switch {
	case timeoutSec <= 0:
		return 0
	case timeoutSec < MinTimeoutSec:
		return MinTimeoutSec
	default:
		return timeoutSec
	}
}
|
|
|
|
// agentForSandbox looks up the host for the given sandbox and returns a client.
|
|
func (s *SandboxService) agentForSandbox(ctx context.Context, sandboxID pgtype.UUID) (hostagentClient, db.Sandbox, error) {
|
|
sb, err := s.DB.GetSandbox(ctx, sandboxID)
|
|
if err != nil {
|
|
return nil, db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
agent, err := s.agentForHost(ctx, sb.HostID)
|
|
if err != nil {
|
|
return nil, db.Sandbox{}, err
|
|
}
|
|
return agent, sb, nil
|
|
}
|
|
|
|
// agentForHost returns the host client by host UUID, skipping the sandbox
|
|
// lookup. Used by callers that already have a db.Sandbox in hand.
|
|
func (s *SandboxService) agentForHost(ctx context.Context, hostID pgtype.UUID) (hostagentClient, error) {
|
|
host, err := s.DB.GetHost(ctx, hostID)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("host not found: %w", err)
|
|
}
|
|
agent, err := s.Pool.GetForHost(host)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("get agent client: %w", err)
|
|
}
|
|
return agent, nil
|
|
}
|
|
|
|
func (s *SandboxService) publishEvent(ctx context.Context, event SandboxStateEvent) {
|
|
if s.PublishEvent != nil {
|
|
s.PublishEvent(ctx, event)
|
|
}
|
|
}
|
|
|
|
// hostagentClient is a local alias to avoid the full package path in signatures.
|
|
type hostagentClient = interface {
|
|
CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
|
|
DestroySandbox(ctx context.Context, req *connect.Request[pb.DestroySandboxRequest]) (*connect.Response[pb.DestroySandboxResponse], error)
|
|
PauseSandbox(ctx context.Context, req *connect.Request[pb.PauseSandboxRequest]) (*connect.Response[pb.PauseSandboxResponse], error)
|
|
ResumeSandbox(ctx context.Context, req *connect.Request[pb.ResumeSandboxRequest]) (*connect.Response[pb.ResumeSandboxResponse], error)
|
|
PingSandbox(ctx context.Context, req *connect.Request[pb.PingSandboxRequest]) (*connect.Response[pb.PingSandboxResponse], error)
|
|
GetSandboxMetrics(ctx context.Context, req *connect.Request[pb.GetSandboxMetricsRequest]) (*connect.Response[pb.GetSandboxMetricsResponse], error)
|
|
FlushSandboxMetrics(ctx context.Context, req *connect.Request[pb.FlushSandboxMetricsRequest]) (*connect.Response[pb.FlushSandboxMetricsResponse], error)
|
|
CreateSnapshot(ctx context.Context, req *connect.Request[pb.CreateSnapshotRequest]) (*connect.Response[pb.CreateSnapshotResponse], error)
|
|
}
|
|
|
|
// Create creates a new sandbox asynchronously: picks a host, inserts a
|
|
// "starting" DB record, fires the agent RPC in a background goroutine, and
|
|
// returns the sandbox immediately. The background goroutine publishes a
|
|
// sandbox event to the Redis stream when the operation completes.
|
|
func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.Sandbox, error) {
|
|
if p.Template == "" {
|
|
p.Template = "minimal-ubuntu"
|
|
}
|
|
if err := validate.SafeName(p.Template); err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("invalid template name: %w", err)
|
|
}
|
|
if p.VCPUs <= 0 {
|
|
p.VCPUs = 1
|
|
}
|
|
if p.MemoryMB <= 0 {
|
|
p.MemoryMB = 512
|
|
}
|
|
p.TimeoutSec = clampTimeout(p.TimeoutSec)
|
|
|
|
// Resolve template name → (teamID, templateID). System base templates are
|
|
// platform-owned rows like any other, so the lookup handles them too (the
|
|
// query also matches platform templates for any team).
|
|
tmpl, err := s.DB.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: p.Template, TeamID: p.TeamID})
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("template %q not found: %w", p.Template, err)
|
|
}
|
|
templateTeamID := tmpl.TeamID
|
|
templateID := tmpl.ID
|
|
templateDefaultUser := tmpl.DefaultUser
|
|
var templateDefaultEnv map[string]string
|
|
if len(tmpl.DefaultEnv) > 0 {
|
|
_ = json.Unmarshal(tmpl.DefaultEnv, &templateDefaultEnv)
|
|
}
|
|
if tmpl.Type == "snapshot" {
|
|
p.VCPUs = tmpl.Vcpus
|
|
p.MemoryMB = tmpl.MemoryMb
|
|
}
|
|
|
|
if !p.TeamID.Valid {
|
|
return db.Sandbox{}, fmt.Errorf("invalid request: team_id is required")
|
|
}
|
|
|
|
team, err := s.DB.GetTeam(ctx, p.TeamID)
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("team not found: %w", err)
|
|
}
|
|
|
|
host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc, p.MemoryMB, 0)
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("select host: %w", err)
|
|
}
|
|
|
|
agent, err := s.Pool.GetForHost(host)
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("get agent client: %w", err)
|
|
}
|
|
|
|
sandboxID := id.NewSandboxID()
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
hostIDStr := id.FormatHostID(host.ID)
|
|
|
|
sb, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
|
|
ID: sandboxID,
|
|
TeamID: p.TeamID,
|
|
HostID: host.ID,
|
|
Template: p.Template,
|
|
Status: "starting",
|
|
Vcpus: p.VCPUs,
|
|
MemoryMb: p.MemoryMB,
|
|
TimeoutSec: p.TimeoutSec,
|
|
DiskSizeMb: 0,
|
|
TemplateID: templateID,
|
|
TemplateTeamID: templateTeamID,
|
|
Metadata: []byte("{}"),
|
|
})
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("insert sandbox: %w", err)
|
|
}
|
|
|
|
teamIDStr := id.FormatTeamID(p.TeamID)
|
|
go s.createInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, p, templateTeamID, templateID, templateDefaultUser, templateDefaultEnv)
|
|
|
|
return sb, nil
|
|
}
|
|
|
|
func (s *SandboxService) createInBackground(
|
|
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string,
|
|
agent hostagentClient, p SandboxCreateParams,
|
|
templateTeamID, templateID pgtype.UUID,
|
|
defaultUser string, defaultEnv map[string]string,
|
|
) {
|
|
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
|
defer cancel()
|
|
|
|
resp, err := agent.CreateSandbox(bgCtx, connect.NewRequest(&pb.CreateSandboxRequest{
|
|
SandboxId: sandboxIDStr,
|
|
Template: p.Template,
|
|
TeamId: id.UUIDString(templateTeamID),
|
|
TemplateId: id.UUIDString(templateID),
|
|
Vcpus: p.VCPUs,
|
|
MemoryMb: p.MemoryMB,
|
|
TimeoutSec: p.TimeoutSec,
|
|
DiskSizeMb: 0,
|
|
DefaultUser: defaultUser,
|
|
DefaultEnv: defaultEnv,
|
|
}))
|
|
if err != nil {
|
|
slog.Warn("background create failed", "sandbox_id", sandboxIDStr, "error", err)
|
|
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer errCancel()
|
|
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "starting", Status_2: "error",
|
|
}); dbErr != nil {
|
|
slog.Warn("failed to update sandbox to error after create failure", "id", sandboxIDStr, "error", dbErr)
|
|
}
|
|
s.publishEvent(errCtx, SandboxStateEvent{
|
|
Event: "sandbox.failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Error: err.Error(), Timestamp: time.Now().Unix(),
|
|
})
|
|
return
|
|
}
|
|
|
|
if resp.Msg.DiskSizeMb > 0 {
|
|
if err := s.DB.UpdateSandboxDiskSize(bgCtx, db.UpdateSandboxDiskSizeParams{
|
|
ID: sandboxID,
|
|
DiskSizeMb: resp.Msg.DiskSizeMb,
|
|
}); err != nil {
|
|
slog.Warn("failed to update sandbox disk size", "id", sandboxIDStr, "error", err)
|
|
}
|
|
}
|
|
|
|
now := time.Now()
|
|
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
|
|
ID: sandboxID,
|
|
Status: "starting",
|
|
HostIp: resp.Msg.HostIp,
|
|
StartedAt: pgtype.Timestamptz{
|
|
Time: now,
|
|
Valid: true,
|
|
},
|
|
}); dbErr != nil {
|
|
slog.Warn("failed to update sandbox running after create", "id", sandboxIDStr, "error", dbErr)
|
|
}
|
|
|
|
if meta := resp.Msg.Metadata; len(meta) > 0 {
|
|
metaJSON, _ := json.Marshal(meta)
|
|
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
|
|
ID: sandboxID, Metadata: metaJSON,
|
|
}); err != nil {
|
|
slog.Warn("failed to store sandbox metadata", "id", sandboxIDStr, "error", err)
|
|
}
|
|
}
|
|
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.started", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
|
|
Timestamp: now.Unix(),
|
|
})
|
|
}
|
|
|
|
// List returns active sandboxes (excludes stopped/error) belonging to the given team.
|
|
func (s *SandboxService) List(ctx context.Context, teamID pgtype.UUID) ([]db.Sandbox, error) {
|
|
return s.DB.ListSandboxesByTeam(ctx, teamID)
|
|
}
|
|
|
|
// Get returns a single sandbox by ID, scoped to the given team.
|
|
func (s *SandboxService) Get(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
|
|
return s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
}
|
|
|
|
// Pause asynchronously pauses a running sandbox. The DB CAS from "running"
|
|
// to "pausing" is the authoritative gate against concurrent Pause/Destroy
|
|
// calls; if it loses, no agent RPC fires.
|
|
func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
if sb.Status == "paused" {
|
|
return sb, nil
|
|
}
|
|
|
|
if _, err := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "running", Status_2: "pausing",
|
|
}); err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("sandbox not in running state (current: %s)", sb.Status)
|
|
}
|
|
|
|
agent, err := s.agentForHost(ctx, sb.HostID)
|
|
if err != nil {
|
|
// Roll back the CAS so the sandbox isn't stuck in "pausing".
|
|
if _, rerr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "pausing", Status_2: "running",
|
|
}); rerr != nil {
|
|
slog.Warn("failed to roll back pausing→running", "id", id.FormatSandboxID(sandboxID), "error", rerr)
|
|
}
|
|
return db.Sandbox{}, err
|
|
}
|
|
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
hostIDStr := id.FormatHostID(sb.HostID)
|
|
teamIDStr := id.FormatTeamID(sb.TeamID)
|
|
|
|
go s.pauseInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent)
|
|
|
|
sb.Status = "pausing"
|
|
return sb, nil
|
|
}
|
|
|
|
func (s *SandboxService) pauseInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, agent hostagentClient) {
|
|
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
|
|
// Flush metrics before the VM stops sampling so the persisted history
|
|
// covers the entire run-up to the pause.
|
|
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, true)
|
|
|
|
if _, err := agent.PauseSandbox(bgCtx, connect.NewRequest(&pb.PauseSandboxRequest{
|
|
SandboxId: sandboxIDStr,
|
|
})); err != nil {
|
|
slog.Warn("background pause failed", "sandbox_id", sandboxIDStr, "error", err)
|
|
// Best-effort: try to recover the sandbox back to "running" so the
|
|
// user isn't stuck in "pausing".
|
|
if _, dbErr := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "pausing", Status_2: "running",
|
|
}); dbErr != nil {
|
|
slog.Warn("failed to recover pausing→running after pause failure", "id", sandboxIDStr, "error", dbErr)
|
|
}
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.pause_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Error: err.Error(), Timestamp: time.Now().Unix(),
|
|
})
|
|
return
|
|
}
|
|
|
|
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "pausing", Status_2: "paused",
|
|
}); err != nil {
|
|
slog.Warn("failed to update sandbox to paused", "sandbox_id", sandboxIDStr, "error", err)
|
|
}
|
|
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.paused", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Timestamp: time.Now().Unix(),
|
|
})
|
|
}
|
|
|
|
// Resume asynchronously resumes a paused sandbox on its original host.
|
|
// The DB CAS from "paused" to "resuming" gates concurrent Resume/Destroy.
|
|
func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
if sb.Status == "running" {
|
|
return sb, nil
|
|
}
|
|
|
|
if _, err := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "paused", Status_2: "resuming",
|
|
}); err != nil {
|
|
return db.Sandbox{}, fmt.Errorf("sandbox not in paused state (current: %s)", sb.Status)
|
|
}
|
|
|
|
agent, err := s.agentForHost(ctx, sb.HostID)
|
|
if err != nil {
|
|
if _, rerr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "resuming", Status_2: "paused",
|
|
}); rerr != nil {
|
|
slog.Warn("failed to roll back resuming→paused", "id", id.FormatSandboxID(sandboxID), "error", rerr)
|
|
}
|
|
return db.Sandbox{}, err
|
|
}
|
|
|
|
// Look up template defaults so a resumed sandbox has the same env as
|
|
// the original Create did.
|
|
var defaultUser string
|
|
var defaultEnv map[string]string
|
|
if tmpl, terr := s.DB.GetTemplate(ctx, sb.TemplateID); terr == nil {
|
|
defaultUser = tmpl.DefaultUser
|
|
if len(tmpl.DefaultEnv) > 0 {
|
|
_ = json.Unmarshal(tmpl.DefaultEnv, &defaultEnv)
|
|
}
|
|
}
|
|
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
hostIDStr := id.FormatHostID(sb.HostID)
|
|
teamIDStr := id.FormatTeamID(sb.TeamID)
|
|
|
|
go s.resumeInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, sb.TimeoutSec, defaultUser, defaultEnv)
|
|
|
|
sb.Status = "resuming"
|
|
return sb, nil
|
|
}
|
|
|
|
func (s *SandboxService) resumeInBackground(
|
|
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string,
|
|
agent hostagentClient, timeoutSec int32,
|
|
defaultUser string, defaultEnv map[string]string,
|
|
) {
|
|
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
|
|
resp, err := agent.ResumeSandbox(bgCtx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
|
SandboxId: sandboxIDStr,
|
|
TimeoutSec: timeoutSec,
|
|
DefaultUser: defaultUser,
|
|
DefaultEnv: defaultEnv,
|
|
}))
|
|
if err != nil {
|
|
slog.Warn("background resume failed", "sandbox_id", sandboxIDStr, "error", err)
|
|
if _, dbErr := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "resuming", Status_2: "paused",
|
|
}); dbErr != nil {
|
|
slog.Warn("failed to recover resuming→paused after resume failure", "id", sandboxIDStr, "error", dbErr)
|
|
}
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.resume_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Error: err.Error(), Timestamp: time.Now().Unix(),
|
|
})
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
if _, err := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
|
|
ID: sandboxID,
|
|
Status: "resuming",
|
|
HostIp: resp.Msg.HostIp,
|
|
StartedAt: pgtype.Timestamptz{Time: now, Valid: true},
|
|
}); err != nil {
|
|
slog.Warn("failed to update sandbox to running after resume", "id", sandboxIDStr, "error", err)
|
|
}
|
|
|
|
if meta := resp.Msg.Metadata; len(meta) > 0 {
|
|
metaJSON, _ := json.Marshal(meta)
|
|
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
|
|
ID: sandboxID, Metadata: metaJSON,
|
|
}); err != nil {
|
|
slog.Warn("failed to store sandbox metadata after resume", "id", sandboxIDStr, "error", err)
|
|
}
|
|
}
|
|
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.resumed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
|
|
Timestamp: now.Unix(),
|
|
})
|
|
}
|
|
|
|
// CreateSnapshot asynchronously snapshots a running or paused sandbox,
|
|
// publishing the result as a new template owned by the sandbox's team. The DB
|
|
// CAS from the sandbox's current status to "snapshotting" is the authoritative
|
|
// gate against concurrent Pause/Snapshot/Destroy calls; if it loses, no agent
|
|
// RPC fires. A running sandbox is snapshotted live (CH briefly paused, then
|
|
// resumed); a paused sandbox is snapshotted from its on-disk artefacts without
|
|
// reviving the VM. Either way the sandbox returns to its original status on
|
|
// completion. Returns the sandbox (now "snapshotting") and the resolved name.
|
|
func (s *SandboxService) CreateSnapshot(ctx context.Context, sandboxID, teamID pgtype.UUID, name string) (db.Sandbox, string, error) {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return db.Sandbox{}, "", fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
if sb.Status != "running" && sb.Status != "paused" {
|
|
return db.Sandbox{}, "", fmt.Errorf("sandbox is not running or paused (status: %s)", sb.Status)
|
|
}
|
|
origStatus := sb.Status
|
|
|
|
if name == "" {
|
|
name = id.NewSnapshotName()
|
|
}
|
|
if err := validate.SafeName(name); err != nil {
|
|
return db.Sandbox{}, "", fmt.Errorf("invalid name: %w", err)
|
|
}
|
|
// Reject duplicate names up front so we don't pause the VM and dump memory
|
|
// only to fail on the template insert at the very end.
|
|
if _, err := s.DB.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: name, TeamID: teamID}); err == nil {
|
|
return db.Sandbox{}, "", fmt.Errorf("conflict: a snapshot named %q already exists", name)
|
|
}
|
|
|
|
if _, err := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: origStatus, Status_2: "snapshotting",
|
|
}); err != nil {
|
|
return db.Sandbox{}, "", fmt.Errorf("sandbox not in %s state (current: %s)", origStatus, sb.Status)
|
|
}
|
|
|
|
agent, err := s.agentForHost(ctx, sb.HostID)
|
|
if err != nil {
|
|
// Roll back the CAS so the sandbox isn't stuck in "snapshotting".
|
|
if _, rerr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "snapshotting", Status_2: origStatus,
|
|
}); rerr != nil {
|
|
slog.Warn("failed to roll back snapshotting→"+origStatus, "id", id.FormatSandboxID(sandboxID), "error", rerr)
|
|
}
|
|
return db.Sandbox{}, "", err
|
|
}
|
|
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
hostIDStr := id.FormatHostID(sb.HostID)
|
|
teamIDStr := id.FormatTeamID(sb.TeamID)
|
|
|
|
// Notify other clients that the badge moved to "snapshotting".
|
|
s.publishStateChanged(ctx, sandboxIDStr, teamIDStr, hostIDStr, origStatus, "snapshotting")
|
|
|
|
go s.snapshotInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, teamID, agent, name, origStatus, sb.Vcpus, sb.MemoryMb)
|
|
|
|
sb.Status = "snapshotting"
|
|
return sb, name, nil
|
|
}
|
|
|
|
func (s *SandboxService) snapshotInBackground(
|
|
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, teamID pgtype.UUID,
|
|
agent hostagentClient, name, origStatus string, vcpus, memoryMB int32,
|
|
) {
|
|
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
|
|
defer cancel()
|
|
|
|
newTemplateID := id.NewSandboxID() // any random UUID
|
|
templateUUID := pgtype.UUID{Bytes: newTemplateID.Bytes, Valid: true}
|
|
|
|
resp, err := agent.CreateSnapshot(bgCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
|
SandboxId: sandboxIDStr,
|
|
Name: name,
|
|
TeamId: id.UUIDString(teamID),
|
|
TemplateId: id.UUIDString(templateUUID),
|
|
}))
|
|
|
|
// Either way, the host-side op is done; return the badge to its original
|
|
// status (running for a live snapshot, paused for an on-disk one). Use a CAS
|
|
// so a concurrent Destroy (which sets "stopping") wins: if the CAS misses,
|
|
// the sandbox is no longer ours and we must NOT announce its old status. The
|
|
// snapshot itself is still valid and is registered below — a snapshot
|
|
// template outlives its source sandbox.
|
|
if _, derr := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "snapshotting", Status_2: origStatus,
|
|
}); derr != nil {
|
|
slog.Warn("snapshotting→"+origStatus+" CAS missed (sandbox moved on); skipping state signal", "sandbox_id", sandboxIDStr, "error", derr)
|
|
} else {
|
|
s.publishStateChanged(bgCtx, sandboxIDStr, teamIDStr, hostIDStr, "snapshotting", origStatus)
|
|
}
|
|
|
|
if err != nil {
|
|
slog.Warn("background snapshot failed", "sandbox_id", sandboxIDStr, "error", err)
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.snapshot_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Metadata: map[string]string{"name": name}, Error: err.Error(), Timestamp: time.Now().Unix(),
|
|
})
|
|
return
|
|
}
|
|
|
|
if _, err := s.DB.InsertTemplate(bgCtx, db.InsertTemplateParams{
|
|
ID: templateUUID,
|
|
Name: name,
|
|
Type: "snapshot",
|
|
Vcpus: vcpus,
|
|
MemoryMb: memoryMB,
|
|
SizeBytes: resp.Msg.SizeBytes,
|
|
TeamID: teamID,
|
|
DefaultUser: "",
|
|
DefaultEnv: []byte("{}"),
|
|
Metadata: []byte("{}"),
|
|
}); err != nil {
|
|
slog.Warn("failed to insert snapshot template", "sandbox_id", sandboxIDStr, "name", name, "error", err)
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.snapshot_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Metadata: map[string]string{"name": name}, Error: "failed to register snapshot", Timestamp: time.Now().Unix(),
|
|
})
|
|
return
|
|
}
|
|
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.snapshotted", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Metadata: map[string]string{"name": name}, Timestamp: time.Now().Unix(),
|
|
})
|
|
}
|
|
|
|
// publishStateChanged emits a transient capsule.state.changed event so the
|
|
// dashboard flips the status badge during a transition that has no terminal
|
|
// lifecycle verb of its own (e.g. the snapshotting round-trip).
|
|
func (s *SandboxService) publishStateChanged(ctx context.Context, sandboxIDStr, teamIDStr, hostIDStr, from, to string) {
|
|
s.publishEvent(ctx, SandboxStateEvent{
|
|
Event: "sandbox.state_changed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Metadata: map[string]string{"from": from, "to": to}, Timestamp: time.Now().Unix(),
|
|
})
|
|
}
|
|
|
|
// Destroy stops a sandbox asynchronously. Pre-marks the DB status as
|
|
// "stopping" and fires the agent RPC in a background goroutine.
|
|
func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
if sb.Status == "stopped" || sb.Status == "error" {
|
|
return nil
|
|
}
|
|
|
|
agent, _, err := s.agentForSandbox(ctx, sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
hostIDStr := id.FormatHostID(sb.HostID)
|
|
teamIDStr := id.FormatTeamID(sb.TeamID)
|
|
prevStatus := sb.Status
|
|
|
|
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
|
ID: sandboxID, Status: "stopping",
|
|
}); err != nil {
|
|
return fmt.Errorf("pre-mark stopping: %w", err)
|
|
}
|
|
|
|
go s.destroyInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, prevStatus)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *SandboxService) destroyInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, agent hostagentClient, prevStatus string) {
|
|
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
|
defer cancel()
|
|
|
|
if prevStatus == "running" || prevStatus == "pausing" {
|
|
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, false)
|
|
}
|
|
|
|
if _, err := agent.DestroySandbox(bgCtx, connect.NewRequest(&pb.DestroySandboxRequest{
|
|
SandboxId: sandboxIDStr,
|
|
})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
|
|
slog.Warn("background destroy failed", "sandbox_id", sandboxIDStr, "error", err)
|
|
}
|
|
|
|
if prevStatus == "paused" {
|
|
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
|
|
SandboxID: sandboxID, Tier: "10m",
|
|
})
|
|
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
|
|
SandboxID: sandboxID, Tier: "2h",
|
|
})
|
|
}
|
|
|
|
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
|
|
ID: sandboxID, Status: "stopping", Status_2: "stopped",
|
|
}); err != nil {
|
|
slog.Warn("failed to update sandbox to stopped", "sandbox_id", sandboxIDStr, "error", err)
|
|
}
|
|
|
|
s.publishEvent(bgCtx, SandboxStateEvent{
|
|
Event: "sandbox.stopped", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
|
|
Timestamp: time.Now().Unix(),
|
|
})
|
|
}
|
|
|
|
// flushAndPersistMetrics calls FlushSandboxMetrics on the agent and stores
|
|
// the returned data to DB. If allTiers is true, all three tiers are saved;
|
|
// otherwise only the 24h tier (for post-destroy analytics).
|
|
func (s *SandboxService) flushAndPersistMetrics(ctx context.Context, agent hostagentClient, sandboxID pgtype.UUID, allTiers bool) {
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
resp, err := agent.FlushSandboxMetrics(ctx, connect.NewRequest(&pb.FlushSandboxMetricsRequest{
|
|
SandboxId: sandboxIDStr,
|
|
}))
|
|
if err != nil {
|
|
slog.Warn("flush metrics failed (best-effort)", "sandbox_id", sandboxIDStr, "error", err)
|
|
return
|
|
}
|
|
msg := resp.Msg
|
|
|
|
if allTiers {
|
|
s.persistMetricPoints(ctx, sandboxID, "10m", msg.Points_10M)
|
|
s.persistMetricPoints(ctx, sandboxID, "2h", msg.Points_2H)
|
|
}
|
|
s.persistMetricPoints(ctx, sandboxID, "24h", msg.Points_24H)
|
|
}
|
|
|
|
func (s *SandboxService) persistMetricPoints(ctx context.Context, sandboxID pgtype.UUID, tier string, points []*pb.MetricPoint) {
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
for _, p := range points {
|
|
if err := s.DB.InsertSandboxMetricPoint(ctx, db.InsertSandboxMetricPointParams{
|
|
SandboxID: sandboxID,
|
|
Tier: tier,
|
|
Ts: p.TimestampUnix,
|
|
CpuPct: p.CpuPct,
|
|
MemBytes: p.MemBytes,
|
|
DiskBytes: p.DiskBytes,
|
|
}); err != nil {
|
|
slog.Warn("persist metric point failed", "sandbox_id", sandboxIDStr, "tier", tier, "error", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetDiskUsage returns the current disk usage in bytes for a sandbox.
|
|
// For running or paused sandboxes, it queries the host agent for live data.
|
|
// For other states or when the agent is unreachable, it falls back to the
|
|
// last known metric point from the database.
|
|
func (s *SandboxService) GetDiskUsage(ctx context.Context, sandboxID, teamID pgtype.UUID) (int64, error) {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return 0, fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
|
|
// For running or paused sandboxes, try the agent for live disk usage.
|
|
if sb.Status == "running" || sb.Status == "paused" {
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
agent, hostErr := s.agentForHost(ctx, sb.HostID)
|
|
if hostErr == nil {
|
|
resp, err := agent.GetSandboxMetrics(ctx, connect.NewRequest(&pb.GetSandboxMetricsRequest{
|
|
SandboxId: sandboxIDStr,
|
|
Range: "5m",
|
|
}))
|
|
if err == nil && len(resp.Msg.Points) > 0 {
|
|
last := resp.Msg.Points[len(resp.Msg.Points)-1]
|
|
return last.DiskBytes, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback: query the database for the last known metric point.
|
|
point, err := s.DB.GetLatestSandboxMetricPoint(ctx, sandboxID)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return point.DiskBytes, nil
|
|
}
|
|
|
|
// Ping resets the inactivity timer for a running sandbox.
|
|
func (s *SandboxService) Ping(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
|
|
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
|
|
if err != nil {
|
|
return fmt.Errorf("sandbox not found: %w", err)
|
|
}
|
|
if sb.Status != "running" {
|
|
return fmt.Errorf("sandbox is not running (status: %s)", sb.Status)
|
|
}
|
|
|
|
agent, _, err := s.agentForSandbox(ctx, sandboxID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
|
|
if _, err := agent.PingSandbox(ctx, connect.NewRequest(&pb.PingSandboxRequest{
|
|
SandboxId: sandboxIDStr,
|
|
})); err != nil {
|
|
return fmt.Errorf("agent ping: %w", err)
|
|
}
|
|
|
|
if err := s.DB.UpdateLastActive(ctx, db.UpdateLastActiveParams{
|
|
ID: sandboxID,
|
|
LastActiveAt: pgtype.Timestamptz{
|
|
Time: time.Now(),
|
|
Valid: true,
|
|
},
|
|
}); err != nil {
|
|
slog.Warn("ping: failed to update last_active_at", "sandbox_id", sandboxIDStr, "error", err)
|
|
}
|
|
return nil
|
|
}
|