1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#50
This commit is contained in:
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions

View File

@ -2,6 +2,7 @@ package service
import (
"context"
"encoding/base64"
"encoding/json"
"fmt"
"log/slog"
@ -26,14 +27,17 @@ const (
buildCommandTimeout = 30 * time.Second
)
// preBuildCmds run before the user recipe to prepare the build environment.
// apt update runs as root first, then USER switches to wrenn-user for the recipe.
// preBuildCmds run before the recipe to prepare the build environment, as
// root. The build user (USER/WORKDIR) is not injected here — Create prepends
// it to the persisted recipe instead, so "run as root" can omit it with no
// build-level flag to track.
var preBuildCmds = []string{
"RUN apt update",
"USER wrenn-user",
"WORKDIR /home/wrenn-user",
}
// buildUser is the non-root user a recipe runs as unless run_as_root is set.
const buildUser = "wrenn-user"
// postBuildCmds run after the user recipe to clean up caches and reduce image size.
var postBuildCmds = []string{
"RUN apt clean",
@ -47,6 +51,8 @@ type buildAgentClient interface {
CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
DestroySandbox(ctx context.Context, req *connect.Request[pb.DestroySandboxRequest]) (*connect.Response[pb.DestroySandboxResponse], error)
Exec(ctx context.Context, req *connect.Request[pb.ExecRequest]) (*connect.Response[pb.ExecResponse], error)
PtyAttach(ctx context.Context, req *connect.Request[pb.PtyAttachRequest]) (*connect.ServerStreamForClient[pb.PtyAttachResponse], error)
PtyKill(ctx context.Context, req *connect.Request[pb.PtyKillRequest]) (*connect.Response[pb.PtyKillResponse], error)
WriteFile(ctx context.Context, req *connect.Request[pb.WriteFileRequest]) (*connect.Response[pb.WriteFileResponse], error)
CreateSnapshot(ctx context.Context, req *connect.Request[pb.CreateSnapshotRequest]) (*connect.Response[pb.CreateSnapshotResponse], error)
FlattenRootfs(ctx context.Context, req *connect.Request[pb.FlattenRootfsRequest]) (*connect.Response[pb.FlattenRootfsResponse], error)
@ -73,6 +79,7 @@ type BuildCreateParams struct {
VCPUs int32
MemoryMB int32
SkipPrePost bool
RunAsRoot bool // Run the recipe as root instead of the non-root build user.
Archive []byte // Optional tar/tar.gz/zip archive for COPY commands.
ArchiveName string // Original filename (used to detect format).
}
@ -99,7 +106,7 @@ func (s *BuildService) takeArchive(buildID string) []byte {
// Create inserts a new build record and enqueues it to Redis.
func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.TemplateBuild, error) {
if p.BaseTemplate == "" {
p.BaseTemplate = "minimal"
p.BaseTemplate = "minimal-ubuntu"
}
if p.VCPUs <= 0 {
p.VCPUs = 1
@ -108,7 +115,19 @@ func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.Temp
p.MemoryMB = 512
}
recipeJSON, err := json.Marshal(p.Recipe)
// Assemble the recipe. Unless run_as_root is set, the non-root build user
// is prepended as USER + WORKDIR steps. Persisting it in the recipe means
// "run as root" needs no build-level flag — it simply omits these steps,
// so wrenn-user is never created in a root template.
recipeLines := p.Recipe
if !p.RunAsRoot {
recipeLines = append([]string{
"USER " + buildUser,
"WORKDIR /home/" + buildUser,
}, recipeLines...)
}
recipeJSON, err := json.Marshal(recipeLines)
if err != nil {
return db.TemplateBuild{}, fmt.Errorf("marshal recipe: %w", err)
}
@ -130,7 +149,7 @@ func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.Temp
Healthcheck: p.Healthcheck,
Vcpus: p.VCPUs,
MemoryMb: p.MemoryMB,
TotalSteps: int32(len(p.Recipe) + defaultSteps),
TotalSteps: int32(len(recipeLines) + defaultSteps),
TemplateID: newTemplateID,
TeamID: id.PlatformTeamID,
SkipPrePost: p.SkipPrePost,
@ -183,6 +202,7 @@ func (s *BuildService) Cancel(ctx context.Context, buildID pgtype.UUID) error {
}); err != nil {
return fmt.Errorf("update build status: %w", err)
}
s.publishStatus(ctx, buildID, "cancelled", 0, 0, "")
// If the build is currently running, signal its context.
buildIDStr := id.FormatBuildID(buildID)
@ -274,6 +294,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
log.Error("failed to update build status", "error", err)
return
}
s.publishStatus(buildCtx, buildID, "running", 0, build.TotalSteps, "")
// Parse user recipe.
var userRecipe []string
@ -282,69 +303,11 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
return
}
// Pick a platform host and create a sandbox.
host, err := s.Scheduler.SelectHost(buildCtx, id.PlatformTeamID, false, build.MemoryMb, 5120)
agent, sandboxIDStr, sandboxMetadata, err := s.provisionBuildSandbox(buildCtx, buildID, buildIDStr, build, log)
if err != nil {
s.failBuild(buildCtx, buildID, fmt.Sprintf("no host available: %v", err))
return
}
agent, err := s.Pool.GetForHost(host)
if err != nil {
s.failBuild(buildCtx, buildID, fmt.Sprintf("agent client error: %v", err))
return
}
sandboxID := id.NewSandboxID()
sandboxIDStr := id.FormatSandboxID(sandboxID)
log = log.With("sandbox_id", sandboxIDStr, "host_id", id.FormatHostID(host.ID))
// Resolve the base template to UUIDs. "minimal" is the zero sentinel.
baseTeamID := id.PlatformTeamID
baseTemplateID := id.MinimalTemplateID
if build.BaseTemplate != "minimal" {
baseTmpl, err := s.DB.GetPlatformTemplateByName(buildCtx, build.BaseTemplate)
if err != nil {
s.failBuild(buildCtx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
return
}
baseTeamID = baseTmpl.TeamID
baseTemplateID = baseTmpl.ID
}
resp, err := agent.CreateSandbox(buildCtx, connect.NewRequest(&pb.CreateSandboxRequest{
SandboxId: sandboxIDStr,
Template: build.BaseTemplate,
TeamId: id.UUIDString(baseTeamID),
TemplateId: id.UUIDString(baseTemplateID),
Vcpus: build.Vcpus,
MemoryMb: build.MemoryMb,
TimeoutSec: 0, // no auto-pause for builds
DiskSizeMb: 5120, // 5 GB for template builds
}))
if err != nil {
s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
return
}
// Capture sandbox metadata (envd/kernel/firecracker/agent versions).
sandboxMetadata := resp.Msg.Metadata
// Record sandbox/host association.
_ = s.DB.UpdateBuildSandbox(buildCtx, db.UpdateBuildSandboxParams{
ID: buildID,
SandboxID: sandboxID,
HostID: host.ID,
})
// Upload and extract build archive if provided.
archive := s.takeArchive(buildIDStr)
if len(archive) > 0 {
if err := s.uploadAndExtractArchive(buildCtx, agent, sandboxIDStr, archive, buildIDStr); err != nil {
s.destroySandbox(buildCtx, agent, sandboxIDStr)
s.failBuild(buildCtx, buildID, fmt.Sprintf("archive upload failed: %v", err))
return
}
}
log = log.With("sandbox_id", sandboxIDStr)
// Parse recipe steps. preBuildCmds and postBuildCmds are hardcoded and always
// valid; panic on error is appropriate here since it would be a programmer mistake.
@ -376,16 +339,35 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
}
bctx := &recipe.ExecContext{EnvVars: envVars, User: "root"}
// Per-step progress callback for live UI updates.
progressFn := func(currentStep int, allEntries []recipe.BuildLogEntry) {
s.updateLogs(buildCtx, buildID, currentStep, allEntries)
}
streamFn := s.ptyStreamExec(agent)
runPhase := func(phase string, steps []recipe.Step, defaultTimeout time.Duration) bool {
newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step, defaultTimeout, bctx, agent.Exec, func(currentStep int, phaseEntries []recipe.BuildLogEntry) {
// Progress callback: combine prior logs with current phase entries.
progressFn(currentStep, append(logs, phaseEntries...))
})
// step-start: published before each step begins.
onStepStart := func(stepNum int, ph string, st recipe.Step) {
publishBuildEvent(buildCtx, s.Redis, buildIDStr, BuildStreamEvent{
Type: "step-start", Step: stepNum, Phase: ph, Cmd: st.Raw,
})
}
// output: raw PTY bytes from a streaming RUN step, base64-encoded.
onChunk := func(stepNum int, data []byte) {
publishBuildEvent(buildCtx, s.Redis, buildIDStr, BuildStreamEvent{
Type: "output", Step: stepNum, Data: base64.StdEncoding.EncodeToString(data),
})
}
// onProgress: persist the DB log snapshot and publish step-end.
onProgress := func(currentStep int, phaseEntries []recipe.BuildLogEntry) {
s.updateLogs(buildCtx, buildID, currentStep, append(logs, phaseEntries...))
if len(phaseEntries) > 0 {
last := phaseEntries[len(phaseEntries)-1]
publishBuildEvent(buildCtx, s.Redis, buildIDStr, BuildStreamEvent{
Type: "step-end", Step: last.Step, Phase: last.Phase, Cmd: last.Cmd,
Exit: last.Exit, Ok: last.Ok, ElapsedMs: last.Elapsed,
})
}
}
newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step,
defaultTimeout, bctx, agent.Exec, streamFn, onStepStart, onChunk, onProgress)
logs = append(logs, newEntries...)
step = nextStep
s.updateLogs(buildCtx, buildID, step, logs)
@ -408,15 +390,16 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
return ok
}
// Phase 1: Pre-build (as root) — creates wrenn-user, updates apt.
// Phase 1: Pre-build (as root) — apt update.
if !build.SkipPrePost {
if !runPhase("pre-build", preBuildSteps, 0) {
return
}
}
// Phase 2: User recipe — starts as wrenn-user (set by USER in pre-build)
// or root if skip_pre_post.
// Phase 2: Recipe — the persisted recipe. For non-root builds it begins
// with the injected USER/WORKDIR steps that create and switch to the build
// user; for run_as_root builds it runs as root throughout.
if !runPhase("recipe", userRecipeSteps, buildCommandTimeout) {
return
}
@ -435,81 +418,186 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
}
}
// Healthcheck or direct snapshot.
// Finalize: healthcheck/snapshot/flatten → persist template → mark success.
s.finalizeBuild(buildCtx, buildID, build, agent, sandboxIDStr, templateDefaultUser, templateDefaultEnv, sandboxMetadata, log)
}
// provisionBuildSandbox picks a platform host, creates the build sandbox on it
// from the build's base template, records the sandbox in the DB, and uploads
// the build archive if one was staged for this build.
//
// It returns the host agent client, the formatted sandbox ID, and the metadata
// returned in the CreateSandbox response. On every error return failBuild has
// already been called (and, for an archive failure, the sandbox has been
// destroyed), so the caller only needs to stop; the returned error is the
// underlying cause.
func (s *BuildService) provisionBuildSandbox(
	ctx context.Context,
	buildID pgtype.UUID,
	buildIDStr string,
	build db.TemplateBuild,
	log *slog.Logger,
) (buildAgentClient, string, map[string]string, error) {
	host, err := s.Scheduler.SelectHost(ctx, id.PlatformTeamID, false, build.MemoryMb, 5120)
	if err != nil {
		s.failBuild(ctx, buildID, fmt.Sprintf("no host available: %v", err))
		return nil, "", nil, err
	}

	agent, err := s.Pool.GetForHost(host)
	if err != nil {
		s.failBuild(ctx, buildID, fmt.Sprintf("agent client error: %v", err))
		return nil, "", nil, err
	}

	sandboxID := id.NewSandboxID()
	sandboxIDStr := id.FormatSandboxID(sandboxID)
	log.Info("provisioning build sandbox", "sandbox_id", sandboxIDStr, "host_id", id.FormatHostID(host.ID))

	// All base templates — including the built-in system ones — are
	// platform-owned rows, so resolve the path from the DB record.
	baseTmpl, err := s.DB.GetPlatformTemplateByName(ctx, build.BaseTemplate)
	if err != nil {
		s.failBuild(ctx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
		return nil, "", nil, err
	}
	baseTeamID := baseTmpl.TeamID
	baseTemplateID := baseTmpl.ID

	resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
		SandboxId:  sandboxIDStr,
		Template:   build.BaseTemplate,
		TeamId:     id.UUIDString(baseTeamID),
		TemplateId: id.UUIDString(baseTemplateID),
		Vcpus:      build.Vcpus,
		MemoryMb:   build.MemoryMb,
		TimeoutSec: 0,
		DiskSizeMb: 0,
	}))
	if err != nil {
		s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
		return nil, "", nil, err
	}
	sandboxMetadata := resp.Msg.Metadata

	// Bookkeeping writes below are best-effort: the sandbox already exists, so
	// a DB hiccup is logged rather than failing the build.
	if err := s.DB.UpdateBuildSandbox(ctx, db.UpdateBuildSandboxParams{
		ID:        buildID,
		SandboxID: sandboxID,
		HostID:    host.ID,
	}); err != nil {
		log.Warn("failed to record build sandbox association", "error", err)
	}

	if _, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
		ID:             sandboxID,
		TeamID:         id.PlatformTeamID,
		HostID:         host.ID,
		Template:       build.BaseTemplate,
		Status:         "running",
		Vcpus:          build.Vcpus,
		MemoryMb:       build.MemoryMb,
		TimeoutSec:     0,
		DiskSizeMb:     0,
		TemplateID:     baseTemplateID,
		TemplateTeamID: baseTeamID,
		Metadata:       []byte("{}"),
	}); err != nil {
		log.Warn("failed to insert builder sandbox record", "error", err)
	}

	// The request asks for DiskSizeMb 0; persist whatever size the agent
	// actually reports, when it reports one.
	if resp.Msg.DiskSizeMb > 0 {
		if err := s.DB.UpdateSandboxDiskSize(ctx, db.UpdateSandboxDiskSizeParams{
			ID:         sandboxID,
			DiskSizeMb: resp.Msg.DiskSizeMb,
		}); err != nil {
			log.Warn("failed to update builder sandbox disk size", "error", err)
		}
	}

	archive := s.takeArchive(buildIDStr)
	if len(archive) > 0 {
		if err := s.uploadAndExtractArchive(ctx, agent, sandboxIDStr, archive, buildIDStr); err != nil {
			// The sandbox is useless without its build inputs; tear it down
			// before reporting the failure.
			s.destroySandbox(ctx, agent, sandboxIDStr)
			s.failBuild(ctx, buildID, fmt.Sprintf("archive upload failed: %v", err))
			return nil, "", nil, err
		}
	}

	return agent, sandboxIDStr, sandboxMetadata, nil
}
// finalizeBuild handles the healthcheck/snapshot/flatten step and persists the
// template record. Called after all recipe phases complete successfully.
func (s *BuildService) finalizeBuild(
ctx context.Context,
buildID pgtype.UUID,
build db.TemplateBuild,
agent buildAgentClient,
sandboxIDStr string,
defaultUser string,
defaultEnv map[string]string,
sandboxMetadata map[string]string,
log *slog.Logger,
) {
var sizeBytes int64
if build.Healthcheck != "" {
hc, err := recipe.ParseHealthcheck(build.Healthcheck)
if err != nil {
s.destroySandbox(buildCtx, agent, sandboxIDStr)
s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
s.destroySandbox(ctx, agent, sandboxIDStr)
s.failBuild(ctx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
return
}
log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc, templateDefaultUser); err != nil {
s.destroySandbox(buildCtx, agent, sandboxIDStr)
if buildCtx.Err() != nil {
if err := s.waitForHealthcheck(ctx, agent, sandboxIDStr, hc, defaultUser); err != nil {
s.destroySandbox(ctx, agent, sandboxIDStr)
if ctx.Err() != nil {
return
}
s.failBuild(buildCtx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
s.failBuild(ctx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
return
}
// Healthcheck passed → full snapshot (with memory/CPU state).
log.Info("healthcheck passed, creating snapshot")
snapResp, err := agent.CreateSnapshot(buildCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
snapResp, err := agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
SandboxId: sandboxIDStr,
Name: build.Name,
TeamId: id.UUIDString(build.TeamID),
TemplateId: id.UUIDString(build.TemplateID),
}))
if err != nil {
s.destroySandbox(buildCtx, agent, sandboxIDStr)
if buildCtx.Err() != nil {
s.destroySandbox(ctx, agent, sandboxIDStr)
if ctx.Err() != nil {
return
}
s.failBuild(buildCtx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
s.failBuild(ctx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
return
}
sizeBytes = snapResp.Msg.SizeBytes
} else {
// No healthcheck → image-only template (rootfs only).
log.Info("no healthcheck, flattening rootfs")
flatResp, err := agent.FlattenRootfs(buildCtx, connect.NewRequest(&pb.FlattenRootfsRequest{
flatResp, err := agent.FlattenRootfs(ctx, connect.NewRequest(&pb.FlattenRootfsRequest{
SandboxId: sandboxIDStr,
Name: build.Name,
TeamId: id.UUIDString(build.TeamID),
TemplateId: id.UUIDString(build.TemplateID),
}))
if err != nil {
s.destroySandbox(buildCtx, agent, sandboxIDStr)
if buildCtx.Err() != nil {
s.destroySandbox(ctx, agent, sandboxIDStr)
if ctx.Err() != nil {
return
}
s.failBuild(buildCtx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
s.failBuild(ctx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
return
}
sizeBytes = flatResp.Msg.SizeBytes
}
// Insert into templates table as a global (platform) template.
templateType := "base"
if build.Healthcheck != "" {
templateType = "snapshot"
}
// Serialize env vars for DB storage.
defaultEnvJSON, err := json.Marshal(templateDefaultEnv)
defaultEnvJSON, err := json.Marshal(defaultEnv)
if err != nil {
defaultEnvJSON = []byte("{}")
}
// Serialize sandbox metadata for DB storage.
metadataJSON, err := json.Marshal(sandboxMetadata)
if err != nil || len(sandboxMetadata) == 0 {
metadataJSON = []byte("{}")
}
if _, err := s.DB.InsertTemplate(buildCtx, db.InsertTemplateParams{
if _, err := s.DB.InsertTemplate(ctx, db.InsertTemplateParams{
ID: build.TemplateID,
Name: build.Name,
Type: templateType,
@ -517,33 +605,28 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
MemoryMb: build.MemoryMb,
SizeBytes: sizeBytes,
TeamID: id.PlatformTeamID,
DefaultUser: templateDefaultUser,
DefaultUser: defaultUser,
DefaultEnv: defaultEnvJSON,
Metadata: metadataJSON,
}); err != nil {
log.Error("failed to insert template record", "error", err)
// Build succeeded on disk, just DB record failed — don't mark as failed.
}
// Record defaults and metadata on the build record for inspection.
_ = s.DB.UpdateBuildDefaults(buildCtx, db.UpdateBuildDefaultsParams{
_ = s.DB.UpdateBuildDefaults(ctx, db.UpdateBuildDefaultsParams{
ID: buildID,
DefaultUser: templateDefaultUser,
DefaultUser: defaultUser,
DefaultEnv: defaultEnvJSON,
Metadata: metadataJSON,
})
// For CreateSnapshot, the sandbox is already destroyed by the snapshot process.
// For FlattenRootfs, the sandbox is already destroyed by the flatten process.
// No additional destroy needed.
// Mark build as success.
if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
ID: buildID, Status: "success",
}); err != nil {
log.Error("failed to mark build as success", "error", err)
}
s.publishStatus(ctx, buildID, "success", build.TotalSteps, build.TotalSteps, "")
s.destroySandbox(ctx, agent, sandboxIDStr)
log.Info("template build completed successfully", "name", build.Name)
}
@ -642,6 +725,91 @@ func (s *BuildService) failBuild(_ context.Context, buildID pgtype.UUID, errMsg
}); err != nil {
slog.Error("failed to update build error", "build_id", id.FormatBuildID(buildID), "error", err)
}
s.publishStatus(ctx, buildID, "failed", 0, 0, errMsg)
}
// build PTY dimensions — wide enough for tools that adapt output to terminal
// width (apt/pip progress bars).
const (
buildPtyCols = 120
buildPtyRows = 40
)
// publishStatus sends a "build-status" event carrying the given status, step
// counters and error message to the live stream of the build identified by
// buildID.
func (s *BuildService) publishStatus(ctx context.Context, buildID pgtype.UUID, status string, currentStep, totalSteps int32, errMsg string) {
	event := BuildStreamEvent{Type: "build-status"}
	event.Status = status
	event.CurrentStep = currentStep
	event.TotalSteps = totalSteps
	event.Error = errMsg

	publishBuildEvent(ctx, s.Redis, id.FormatBuildID(buildID), event)
}
// ptyStreamExec returns a recipe.StreamExecFunc that runs a shell command in a
// PTY on the build sandbox via the host agent and streams its output. A PTY
// makes build tools emit unbuffered, colorized output (apt/pip progress bars).
//
// The returned function attaches a PTY running `/bin/sh -c shellCmd` and hands
// back a channel that is closed when streaming stops. The channel carries
// output chunks, then either a Done chunk with the exit code or an Err chunk
// if the stream ended without an exit event. An attach failure is returned
// directly and no channel is created.
func (s *BuildService) ptyStreamExec(agent buildAgentClient) recipe.StreamExecFunc {
	return func(ctx context.Context, sandboxID, shellCmd string) (<-chan recipe.PtyChunk, error) {
		tag := "build-" + id.NewPtyTag()
		stream, err := agent.PtyAttach(ctx, connect.NewRequest(&pb.PtyAttachRequest{
			SandboxId: sandboxID,
			Tag:       tag,
			Cmd:       "/bin/sh",
			Args:      []string{"-c", shellCmd},
			Cols:      buildPtyCols,
			Rows:      buildPtyRows,
		}))
		if err != nil {
			return nil, err
		}

		ch := make(chan recipe.PtyChunk, 64)
		go func() {
			defer close(ch)
			defer stream.Close()

			// send delivers one chunk, giving up once ctx is cancelled so a
			// consumer that stopped reading cannot wedge this goroutine.
			send := func(chunk recipe.PtyChunk) bool {
				select {
				case ch <- chunk:
					return true
				case <-ctx.Done():
					return false
				}
			}

			gotExit := false
		recv:
			for stream.Receive() {
				switch ev := stream.Msg().Event.(type) {
				case *pb.PtyAttachResponse_Output:
					if !send(recipe.PtyChunk{Data: ev.Output.Data}) {
						// Cancelled mid-send: fall through to the cleanup
						// below so the guest process is still killed.
						break recv
					}
				case *pb.PtyAttachResponse_Exited:
					gotExit = true
					if !send(recipe.PtyChunk{Done: true, Exit: ev.Exited.ExitCode}) {
						return
					}
				}
			}
			if gotExit {
				// The process already exited; nothing to kill or report.
				return
			}

			// No exit event was seen: timeout, cancellation, or stream error.
			// When the context is done, kill the lingering guest process so it
			// does not keep running. A fresh context is used because ctx
			// itself is already cancelled.
			streamErr := stream.Err()
			if ctx.Err() != nil {
				killCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
				_, _ = agent.PtyKill(killCtx, connect.NewRequest(&pb.PtyKillRequest{
					SandboxId: sandboxID, Tag: tag,
				}))
				cancel()
				if streamErr == nil {
					streamErr = ctx.Err()
				}
			}
			if streamErr == nil {
				streamErr = fmt.Errorf("pty stream ended without an exit event")
			}

			// Report the error. Buffer space is tried first so the error is
			// delivered even after cancellation; if the buffer is full, wait
			// for the consumer only while ctx is still live.
			errChunk := recipe.PtyChunk{Err: streamErr}
			select {
			case ch <- errChunk:
			default:
				select {
				case ch <- errChunk:
				case <-ctx.Done():
				}
			}
		}()
		return ch, nil
	}
}
func (s *BuildService) destroySandbox(_ context.Context, agent buildAgentClient, sandboxIDStr string) {
@ -653,6 +821,13 @@ func (s *BuildService) destroySandbox(_ context.Context, agent buildAgentClient,
})); err != nil {
slog.Warn("failed to destroy build sandbox", "sandbox_id", sandboxIDStr, "error", err)
}
if sbID, err := id.ParseSandboxID(sandboxIDStr); err == nil {
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sbID, Status: "stopped",
}); err != nil {
slog.Warn("failed to mark builder sandbox stopped", "sandbox_id", sandboxIDStr, "error", err)
}
}
}
// fetchSandboxEnv executes the 'env' command inside the specified sandbox via
@ -768,7 +943,7 @@ var runtimeEnvVars = map[string]bool{
"HOME": true, "USER": true, "LOGNAME": true, "SHELL": true,
"PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true,
"SHLVL": true, "_": true,
// Per-sandbox identifiers set by envd at boot via MMDS.
// Per-sandbox identifiers set by envd at boot via PostInit.
"WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true,
}

143
pkg/service/build_broker.go Normal file
View File

@ -0,0 +1,143 @@
package service
import (
"context"
"encoding/json"
"log/slog"
"sync"
"time"
"github.com/redis/go-redis/v9"
)
// buildSubBuffer is the per-subscriber channel buffer. A slow WebSocket
// consumer that fills the buffer drops live events; it recovers the full
// build state from the DB log on reconnect.
const buildSubBuffer = 256
// buildBrokerReconnect is the backoff before re-subscribing to Redis after a
// subscription error.
const buildBrokerReconnect = 2 * time.Second
// BuildBroker fans build events out from per-build Redis pub/sub channels to
// in-process WebSocket subscribers. A Redis subscription is started lazily for
// a build when its first client connects and torn down when the last leaves.
//
// The build worker publishes via publishBuildEvent (Redis only); the broker is
// purely the read/fan-out side. Decoupling through Redis means the worker and
// the WebSocket handler need not run in the same process.
type BuildBroker struct {
// rdb is the Redis client used to subscribe to per-build channels.
rdb *redis.Client
// mu guards builds and the subs set of every fanout stored in it.
mu sync.Mutex
// builds maps a build ID to its active fanout; an entry exists only while
// the build has at least one subscriber.
builds map[string]*buildFanout
}

// buildFanout is the per-build subscriber set plus the handle that stops the
// build's Redis subscription goroutine.
type buildFanout struct {
// subs is the set of subscriber channels currently registered for the build.
subs map[chan BuildStreamEvent]struct{}
// cancel cancels the context passed to the build's run goroutine.
cancel context.CancelFunc
}
// NewBuildBroker returns a broker with no active builds that will read build
// events through the given Redis client.
func NewBuildBroker(rdb *redis.Client) *BuildBroker {
	broker := new(BuildBroker)
	broker.rdb = rdb
	broker.builds = map[string]*buildFanout{}
	return broker
}
// Subscribe registers an in-process subscriber for buildID's event stream and
// returns the receive channel plus a release function. The first subscriber
// for a build starts its Redis subscription; the last to release stops it.
// The release function is idempotent and closes the channel.
//
// The returned channel is buffered with buildSubBuffer slots.
func (b *BuildBroker) Subscribe(buildID string) (<-chan BuildStreamEvent, func()) {
ch := make(chan BuildStreamEvent, buildSubBuffer)
b.mu.Lock()
fan, ok := b.builds[buildID]
if !ok {
// First subscriber for this build: create the fanout and start the
// goroutine that keeps the Redis subscription alive. Its context is
// detached from any caller and is ended only by fan.cancel.
ctx, cancel := context.WithCancel(context.Background())
fan = &buildFanout{subs: make(map[chan BuildStreamEvent]struct{}), cancel: cancel}
b.builds[buildID] = fan
go b.run(ctx, buildID)
}
fan.subs[ch] = struct{}{}
b.mu.Unlock()
// once makes release safe to call more than once.
var once sync.Once
release := func() {
once.Do(func() {
b.mu.Lock()
defer b.mu.Unlock()
// Look the fanout up again by ID: the one captured above may have been
// torn down (and possibly replaced) since Subscribe returned.
fan, ok := b.builds[buildID]
if !ok {
return
}
// Only act if this channel is still registered in the current fanout.
if _, present := fan.subs[ch]; !present {
return
}
delete(fan.subs, ch)
// Closed while holding mu, which dispatch also holds while sending.
close(ch)
if len(fan.subs) == 0 {
// Last subscriber left: stop the Redis subscription and drop the entry.
fan.cancel()
delete(b.builds, buildID)
}
})
}
return ch, release
}
// run maintains the Redis subscription for buildID. Each time a subscription
// attempt returns it waits buildBrokerReconnect and subscribes again; it exits
// as soon as ctx is cancelled (the last subscriber released).
func (b *BuildBroker) run(ctx context.Context, buildID string) {
	for {
		if ctx.Err() != nil {
			return
		}

		b.subscribeOnce(ctx, buildID)
		if ctx.Err() != nil {
			return
		}

		// Back off before re-subscribing, but stay responsive to cancellation.
		backoff := time.After(buildBrokerReconnect)
		select {
		case <-ctx.Done():
			return
		case <-backoff:
		}
	}
}
// subscribeOnce opens one Redis subscription on buildID's channel and forwards
// every decodable message to dispatch. It returns when ctx is cancelled or the
// subscription's message channel is closed; the subscription is closed on
// return. Payloads that fail to decode are logged and skipped.
func (b *BuildBroker) subscribeOnce(ctx context.Context, buildID string) {
	pubsub := b.rdb.Subscribe(ctx, buildStreamChannel(buildID))
	defer pubsub.Close()

	incoming := pubsub.Channel()
	for {
		var payload string
		select {
		case <-ctx.Done():
			return
		case m, open := <-incoming:
			if !open {
				return
			}
			payload = m.Payload
		}

		var ev BuildStreamEvent
		err := json.Unmarshal([]byte(payload), &ev)
		if err != nil {
			slog.Warn("build broker: bad event payload", "build_id", buildID, "error", err)
			continue
		}
		b.dispatch(buildID, ev)
	}
}
// dispatch delivers ev to each subscriber currently registered for buildID.
// Sends never block: when a subscriber's buffer is full the event is dropped
// for that subscriber and a debug line is logged. The broker mutex is held for
// the entire fan-out so that a concurrent release cannot close a channel while
// it is being sent to.
func (b *BuildBroker) dispatch(buildID string, ev BuildStreamEvent) {
	b.mu.Lock()
	defer b.mu.Unlock()

	if fan, found := b.builds[buildID]; found {
		for sub := range fan.subs {
			select {
			case sub <- ev:
			default:
				slog.Debug("build broker: dropped event for slow consumer", "build_id", buildID)
			}
		}
	}
}

View File

@ -0,0 +1,72 @@
package service
import (
"context"
"encoding/json"
"log/slog"
"time"
"github.com/redis/go-redis/v9"
)
// buildStreamChannelPrefix is the Redis pub/sub channel prefix for live build
// events. One channel per build: wrenn:build:{buildID}.
const buildStreamChannelPrefix = "wrenn:build:"
// buildStreamChannel returns the Redis pub/sub channel name for buildID:
// buildStreamChannelPrefix followed by the build ID.
func buildStreamChannel(buildID string) string {
return buildStreamChannelPrefix + buildID
}
// BuildStreamEvent is one event in a build's live stream. The same struct is
// published to Redis by the build worker and forwarded verbatim to admin
// WebSocket clients, so its JSON shape is the wire contract for both.
//
// Type discriminates the payload:
// - "step-start": Step, Phase, Cmd set.
// - "output": Step, Data (base64 PTY bytes) set.
// - "step-end": Step, Phase, Cmd, Exit, Ok, ElapsedMs set.
// - "build-status": Status, CurrentStep, TotalSteps, Error set.
//
// Every field except Type and T is tagged omitempty, so a zero value (step 0,
// exit code 0, ok=false, empty string) is absent from the JSON; consumers must
// treat a missing key as that field's zero value.
type BuildStreamEvent struct {
// Type is the event discriminator listed above; always present.
Type string `json:"type"`
// Step, Phase and Cmd identify the recipe step the event refers to.
Step int `json:"step,omitempty"`
Phase string `json:"phase,omitempty"`
Cmd string `json:"cmd,omitempty"`
Data string `json:"data,omitempty"` // base64-encoded PTY output bytes
// Exit, Ok and ElapsedMs carry the step result on "step-end".
Exit int32 `json:"exit,omitempty"`
Ok bool `json:"ok,omitempty"`
ElapsedMs int64 `json:"elapsed_ms,omitempty"`
// Status, CurrentStep, TotalSteps and Error are set on "build-status".
Status string `json:"status,omitempty"`
CurrentStep int32 `json:"current_step,omitempty"`
TotalSteps int32 `json:"total_steps,omitempty"`
Error string `json:"error,omitempty"`
T int64 `json:"t"` // unix milliseconds, set at publish time
}
// IsTerminalBuildStatus reports whether status is one of the final build
// states — "success", "failed" or "cancelled" — after which the worker
// publishes no further events for the build. Matching is exact and
// case-sensitive; any other string yields false.
func IsTerminalBuildStatus(status string) bool {
	return status == "success" || status == "failed" || status == "cancelled"
}
// publishBuildEvent stamps ev with the current time in unix milliseconds,
// encodes it as JSON and publishes it on the build's Redis channel. It is
// fire-and-forget: a nil client is a no-op, and marshal or publish failures
// are only logged, never returned. Dropping a live event is acceptable because
// the WS client always has the DB log history to fall back on.
func publishBuildEvent(ctx context.Context, rdb *redis.Client, buildID string, ev BuildStreamEvent) {
	if rdb == nil {
		return
	}

	ev.T = time.Now().UnixMilli()
	encoded, marshalErr := json.Marshal(ev)
	if marshalErr != nil {
		slog.Warn("build event marshal failed", "build_id", buildID, "error", marshalErr)
		return
	}

	channel := buildStreamChannel(buildID)
	pubErr := rdb.Publish(ctx, channel, encoded).Err()
	if pubErr == nil {
		return
	}
	slog.Debug("build event publish failed", "build_id", buildID, "error", pubErr)
}

View File

@ -94,6 +94,31 @@ type regTokenPayload struct {
const regTokenTTL = time.Hour
// issueRegistrationToken mints a new registration token for hostID, stores it
// in Redis under "host:reg:<token>" with a TTL of regTokenTTL, and writes an
// audit row attributing the token to createdBy. A Redis failure is returned to
// the caller; a failed audit insert is only logged and the token is still
// returned.
func (s *HostService) issueRegistrationToken(ctx context.Context, hostID, createdBy pgtype.UUID) (string, error) {
	var (
		token     = id.NewRegistrationToken()
		tokenID   = id.NewHostTokenID()
		hostIDStr = id.FormatHostID(hostID)
	)

	// The payload is a struct of two strings, so the marshal error is ignored.
	payload, _ := json.Marshal(regTokenPayload{
		HostID:  hostIDStr,
		TokenID: id.FormatHostTokenID(tokenID),
	})

	redisKey := "host:reg:" + token
	if err := s.Redis.Set(ctx, redisKey, payload, regTokenTTL).Err(); err != nil {
		return "", fmt.Errorf("store registration token: %w", err)
	}

	expiry := pgtype.Timestamptz{Time: time.Now().Add(regTokenTTL), Valid: true}
	_, err := s.DB.InsertHostToken(ctx, db.InsertHostTokenParams{
		ID:        tokenID,
		HostID:    hostID,
		CreatedBy: createdBy,
		ExpiresAt: expiry,
	})
	if err != nil {
		slog.Warn("failed to insert host token audit record", "host_id", hostIDStr, "error", err)
	}

	return token, nil
}
// requireAdminOrOwner returns nil iff the role is "owner" or "admin".
func requireAdminOrOwner(role string) error {
if role == "owner" || role == "admin" {
@ -159,26 +184,9 @@ func (s *HostService) Create(ctx context.Context, p HostCreateParams) (HostCreat
return HostCreateResult{}, fmt.Errorf("insert host: %w", err)
}
// Generate registration token and store in Redis + Postgres audit trail.
token := id.NewRegistrationToken()
tokenID := id.NewHostTokenID()
payload, _ := json.Marshal(regTokenPayload{
HostID: id.FormatHostID(hostID),
TokenID: id.FormatHostTokenID(tokenID),
})
if err := s.Redis.Set(ctx, "host:reg:"+token, payload, regTokenTTL).Err(); err != nil {
return HostCreateResult{}, fmt.Errorf("store registration token: %w", err)
}
now := time.Now()
if _, err := s.DB.InsertHostToken(ctx, db.InsertHostTokenParams{
ID: tokenID,
HostID: hostID,
CreatedBy: p.RequestingUserID,
ExpiresAt: pgtype.Timestamptz{Time: now.Add(regTokenTTL), Valid: true},
}); err != nil {
slog.Warn("failed to insert host token audit record", "host_id", id.FormatHostID(hostID), "error", err)
token, err := s.issueRegistrationToken(ctx, hostID, p.RequestingUserID)
if err != nil {
return HostCreateResult{}, err
}
return HostCreateResult{Host: host, RegistrationToken: token}, nil
@ -218,25 +226,9 @@ func (s *HostService) RegenerateToken(ctx context.Context, hostID, userID, teamI
}
}
token := id.NewRegistrationToken()
tokenID := id.NewHostTokenID()
payload, _ := json.Marshal(regTokenPayload{
HostID: id.FormatHostID(hostID),
TokenID: id.FormatHostTokenID(tokenID),
})
if err := s.Redis.Set(ctx, "host:reg:"+token, payload, regTokenTTL).Err(); err != nil {
return HostCreateResult{}, fmt.Errorf("store registration token: %w", err)
}
now := time.Now()
if _, err := s.DB.InsertHostToken(ctx, db.InsertHostTokenParams{
ID: tokenID,
HostID: hostID,
CreatedBy: userID,
ExpiresAt: pgtype.Timestamptz{Time: now.Add(regTokenTTL), Valid: true},
}); err != nil {
slog.Warn("failed to insert host token audit record", "host_id", id.FormatHostID(hostID), "error", err)
token, err := s.issueRegistrationToken(ctx, hostID, userID)
if err != nil {
return HostCreateResult{}, err
}
return HostCreateResult{Host: host, RegistrationToken: token}, nil
@ -434,13 +426,27 @@ func (s *HostService) Heartbeat(ctx context.Context, hostID pgtype.UUID) error {
// List returns hosts visible to the caller.
// Admins see all hosts; non-admins see only BYOC hosts belonging to their team.
func (s *HostService) List(ctx context.Context, teamID pgtype.UUID, isAdmin bool) ([]db.Host, error) {
func (s *HostService) List(ctx context.Context, teamID pgtype.UUID, isAdmin bool) ([]db.ListHostsByTeamRow, error) {
if isAdmin {
return s.DB.ListHosts(ctx)
rows, err := s.DB.ListHostsAdmin(ctx)
if err != nil {
return nil, err
}
result := make([]db.ListHostsByTeamRow, len(rows))
for i, r := range rows {
result[i] = db.ListHostsByTeamRow(r)
}
return result, nil
}
return s.DB.ListHostsByTeam(ctx, teamID)
}
// ListAdmin returns all hosts with aggregated resource consumption.
// Admin-only — caller must verify admin status; this method performs no
// access check of its own and passes the ListHostsAdmin query result through
// unchanged.
func (s *HostService) ListAdmin(ctx context.Context) ([]db.ListHostsAdminRow, error) {
return s.DB.ListHostsAdmin(ctx)
}
// Get returns a single host, enforcing access control.
func (s *HostService) Get(ctx context.Context, hostID, teamID pgtype.UUID, isAdmin bool) (db.Host, error) {
host, err := s.DB.GetHost(ctx, hostID)

View File

@ -18,12 +18,28 @@ import (
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
)
// SandboxEventPublisher writes sandbox lifecycle events to the Redis stream.
// It is the hook type stored in SandboxService.PublishEvent. The function
// returns nothing, so a publisher has no way to report a delivery failure
// back to the service through this signature.
type SandboxEventPublisher func(ctx context.Context, event SandboxStateEvent)
// SandboxStateEvent is the JSON payload handed to a SandboxEventPublisher for
// every sandbox lifecycle transition. Fields tagged omitempty are left out of
// the encoded payload when they hold their zero value.
type SandboxStateEvent struct {
	// Event is the lifecycle event name, e.g. "sandbox.started" or
	// "sandbox.failed".
	Event string `json:"event"`
	// SandboxID is the formatted ID of the sandbox the event refers to.
	SandboxID string `json:"sandbox_id"`
	// TeamID is the formatted ID of the owning team.
	TeamID string `json:"team_id,omitempty"`
	// HostID is the formatted ID of the host the sandbox is placed on.
	HostID string `json:"host_id"`
	// HostIP is filled from the agent response on started/resumed events.
	HostIP string `json:"host_ip,omitempty"`
	// Metadata carries event-specific key/value details (agent-reported
	// runtime metadata, a snapshot name, or a from/to status pair).
	Metadata map[string]string `json:"metadata,omitempty"`
	// Error holds the failure text on *_failed events.
	Error string `json:"error,omitempty"`
	// Timestamp is the event time in Unix seconds.
	Timestamp int64 `json:"timestamp"`
}
// SandboxService provides sandbox lifecycle operations shared between the
// REST API and the dashboard.
type SandboxService struct {
DB *db.Queries
Pool *lifecycle.HostClientPool
Scheduler scheduler.HostScheduler
DB *db.Queries
Pool *lifecycle.HostClientPool
Scheduler scheduler.HostScheduler
PublishEvent SandboxEventPublisher
}
// SandboxCreateParams holds the parameters for creating a sandbox.
@ -33,7 +49,24 @@ type SandboxCreateParams struct {
VCPUs int32
MemoryMB int32
TimeoutSec int32
DiskSizeMB int32
}
// MinTimeoutSec is the smallest non-zero sandbox TTL, in seconds, and mirrors
// internal/sandbox.MinTimeoutSec. TTLs under a minute race the post-create
// startup window (DB insert → /init → memory loader). The agent silently
// clamps such values, so the control plane clamps as well to keep the DB
// record in agreement with what the agent actually runs. A TTL of 0 means
// "no TTL" and is preserved.
const MinTimeoutSec int32 = 60

// clampTimeout normalises a caller-supplied TTL the same way the host agent
// does: zero or negative values become 0 (no TTL), positive values below
// MinTimeoutSec are raised to MinTimeoutSec, and everything else is returned
// unchanged. Keep in sync with internal/sandbox.clampTimeout.
func clampTimeout(timeoutSec int32) int32 {
	switch {
	case timeoutSec <= 0:
		return 0
	case timeoutSec < MinTimeoutSec:
		return MinTimeoutSec
	default:
		return timeoutSec
	}
}
// agentForSandbox looks up the host for the given sandbox and returns a client.
@ -42,15 +75,31 @@ func (s *SandboxService) agentForSandbox(ctx context.Context, sandboxID pgtype.U
if err != nil {
return nil, db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
}
host, err := s.DB.GetHost(ctx, sb.HostID)
agent, err := s.agentForHost(ctx, sb.HostID)
if err != nil {
return nil, db.Sandbox{}, fmt.Errorf("host not found for sandbox: %w", err)
return nil, db.Sandbox{}, err
}
return agent, sb, nil
}
// agentForHost returns the host client by host UUID, skipping the sandbox
// lookup. Used by callers that already have a db.Sandbox in hand.
func (s *SandboxService) agentForHost(ctx context.Context, hostID pgtype.UUID) (hostagentClient, error) {
host, err := s.DB.GetHost(ctx, hostID)
if err != nil {
return nil, fmt.Errorf("host not found: %w", err)
}
agent, err := s.Pool.GetForHost(host)
if err != nil {
return nil, db.Sandbox{}, fmt.Errorf("get agent client: %w", err)
return nil, fmt.Errorf("get agent client: %w", err)
}
return agent, nil
}
func (s *SandboxService) publishEvent(ctx context.Context, event SandboxStateEvent) {
if s.PublishEvent != nil {
s.PublishEvent(ctx, event)
}
return agent, sb, nil
}
// hostagentClient is a local alias to avoid the full package path in signatures.
@ -62,13 +111,16 @@ type hostagentClient = interface {
PingSandbox(ctx context.Context, req *connect.Request[pb.PingSandboxRequest]) (*connect.Response[pb.PingSandboxResponse], error)
GetSandboxMetrics(ctx context.Context, req *connect.Request[pb.GetSandboxMetricsRequest]) (*connect.Response[pb.GetSandboxMetricsResponse], error)
FlushSandboxMetrics(ctx context.Context, req *connect.Request[pb.FlushSandboxMetricsRequest]) (*connect.Response[pb.FlushSandboxMetricsResponse], error)
CreateSnapshot(ctx context.Context, req *connect.Request[pb.CreateSnapshotRequest]) (*connect.Response[pb.CreateSnapshotResponse], error)
}
// Create creates a new sandbox: picks a host via the scheduler, inserts a pending
// DB record, calls the host agent, and updates the record to running.
// Create creates a new sandbox asynchronously: picks a host, inserts a
// "starting" DB record, fires the agent RPC in a background goroutine, and
// returns the sandbox immediately. The background goroutine publishes a
// sandbox event to the Redis stream when the operation completes.
func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.Sandbox, error) {
if p.Template == "" {
p.Template = "minimal"
p.Template = "minimal-ubuntu"
}
if err := validate.SafeName(p.Template); err != nil {
return db.Sandbox{}, fmt.Errorf("invalid template name: %w", err)
@ -79,46 +131,37 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
if p.MemoryMB <= 0 {
p.MemoryMB = 512
}
if p.DiskSizeMB <= 0 {
p.DiskSizeMB = 5120 // 5 GB default
}
p.TimeoutSec = clampTimeout(p.TimeoutSec)
// Resolve template name → (teamID, templateID).
templateTeamID := id.PlatformTeamID
templateID := id.MinimalTemplateID
var templateDefaultUser string
// Resolve template name → (teamID, templateID). System base templates are
// platform-owned rows like any other, so the lookup handles them too (the
// query also matches platform templates for any team).
tmpl, err := s.DB.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: p.Template, TeamID: p.TeamID})
if err != nil {
return db.Sandbox{}, fmt.Errorf("template %q not found: %w", p.Template, err)
}
templateTeamID := tmpl.TeamID
templateID := tmpl.ID
templateDefaultUser := tmpl.DefaultUser
var templateDefaultEnv map[string]string
if p.Template != "minimal" {
tmpl, err := s.DB.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: p.Template, TeamID: p.TeamID})
if err != nil {
return db.Sandbox{}, fmt.Errorf("template %q not found: %w", p.Template, err)
}
templateTeamID = tmpl.TeamID
templateID = tmpl.ID
templateDefaultUser = tmpl.DefaultUser
// Parse default_env JSONB into a map.
if len(tmpl.DefaultEnv) > 0 {
_ = json.Unmarshal(tmpl.DefaultEnv, &templateDefaultEnv)
}
// If the template is a snapshot, use its baked-in vcpus/memory.
if tmpl.Type == "snapshot" {
p.VCPUs = tmpl.Vcpus
p.MemoryMB = tmpl.MemoryMb
}
if len(tmpl.DefaultEnv) > 0 {
_ = json.Unmarshal(tmpl.DefaultEnv, &templateDefaultEnv)
}
if tmpl.Type == "snapshot" {
p.VCPUs = tmpl.Vcpus
p.MemoryMB = tmpl.MemoryMb
}
if !p.TeamID.Valid {
return db.Sandbox{}, fmt.Errorf("invalid request: team_id is required")
}
// Determine whether this team uses BYOC hosts or platform hosts.
team, err := s.DB.GetTeam(ctx, p.TeamID)
if err != nil {
return db.Sandbox{}, fmt.Errorf("team not found: %w", err)
}
// Pick a host for this sandbox.
host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc, p.MemoryMB, p.DiskSizeMB)
host, err := s.Scheduler.SelectHost(ctx, p.TeamID, team.IsByoc, p.MemoryMB, 0)
if err != nil {
return db.Sandbox{}, fmt.Errorf("select host: %w", err)
}
@ -130,25 +173,42 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
sandboxID := id.NewSandboxID()
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(host.ID)
if _, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
sb, err := s.DB.InsertSandbox(ctx, db.InsertSandboxParams{
ID: sandboxID,
TeamID: p.TeamID,
HostID: host.ID,
Template: p.Template,
Status: "pending",
Status: "starting",
Vcpus: p.VCPUs,
MemoryMb: p.MemoryMB,
TimeoutSec: p.TimeoutSec,
DiskSizeMb: p.DiskSizeMB,
DiskSizeMb: 0,
TemplateID: templateID,
TemplateTeamID: templateTeamID,
Metadata: []byte("{}"),
}); err != nil {
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("insert sandbox: %w", err)
}
resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
teamIDStr := id.FormatTeamID(p.TeamID)
go s.createInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, p, templateTeamID, templateID, templateDefaultUser, templateDefaultEnv)
return sb, nil
}
func (s *SandboxService) createInBackground(
sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string,
agent hostagentClient, p SandboxCreateParams,
templateTeamID, templateID pgtype.UUID,
defaultUser string, defaultEnv map[string]string,
) {
bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()
resp, err := agent.CreateSandbox(bgCtx, connect.NewRequest(&pb.CreateSandboxRequest{
SandboxId: sandboxIDStr,
Template: p.Template,
TeamId: id.UUIDString(templateTeamID),
@ -156,46 +216,62 @@ func (s *SandboxService) Create(ctx context.Context, p SandboxCreateParams) (db.
Vcpus: p.VCPUs,
MemoryMb: p.MemoryMB,
TimeoutSec: p.TimeoutSec,
DiskSizeMb: p.DiskSizeMB,
DefaultUser: templateDefaultUser,
DefaultEnv: templateDefaultEnv,
DiskSizeMb: 0,
DefaultUser: defaultUser,
DefaultEnv: defaultEnv,
}))
if err != nil {
if _, dbErr := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "error",
slog.Warn("background create failed", "sandbox_id", sandboxIDStr, "error", err)
errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer errCancel()
if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "starting", Status_2: "error",
}); dbErr != nil {
slog.Warn("failed to update sandbox status to error", "id", sandboxIDStr, "error", dbErr)
slog.Warn("failed to update sandbox to error after create failure", "id", sandboxIDStr, "error", dbErr)
}
s.publishEvent(errCtx, SandboxStateEvent{
Event: "sandbox.failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
Error: err.Error(), Timestamp: time.Now().Unix(),
})
return
}
if resp.Msg.DiskSizeMb > 0 {
if err := s.DB.UpdateSandboxDiskSize(bgCtx, db.UpdateSandboxDiskSizeParams{
ID: sandboxID,
DiskSizeMb: resp.Msg.DiskSizeMb,
}); err != nil {
slog.Warn("failed to update sandbox disk size", "id", sandboxIDStr, "error", err)
}
return db.Sandbox{}, fmt.Errorf("agent create: %w", err)
}
now := time.Now()
sb, err := s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
if _, dbErr := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
ID: sandboxID,
Status: "starting",
HostIp: resp.Msg.HostIp,
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("update sandbox running: %w", err)
}); dbErr != nil {
slog.Warn("failed to update sandbox running after create", "id", sandboxIDStr, "error", dbErr)
}
// Store runtime metadata from the agent (envd/kernel/firecracker/agent versions).
if meta := resp.Msg.Metadata; len(meta) > 0 {
metaJSON, _ := json.Marshal(meta)
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
ID: sandboxID,
Metadata: metaJSON,
if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
ID: sandboxID, Metadata: metaJSON,
}); err != nil {
slog.Warn("failed to store sandbox metadata", "id", sandboxIDStr, "error", err)
}
sb.Metadata = metaJSON
}
return sb, nil
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.started", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
Timestamp: now.Unix(),
})
}
// List returns active sandboxes (excludes stopped/error) belonging to the given team.
@ -208,152 +284,331 @@ func (s *SandboxService) Get(ctx context.Context, sandboxID, teamID pgtype.UUID)
return s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
}
// Pause snapshots and freezes a running sandbox to disk.
// Pause asynchronously pauses a running sandbox. The DB CAS from "running"
// to "pausing" is the authoritative gate against concurrent Pause/Destroy
// calls; if it loses, no agent RPC fires.
func (s *SandboxService) Pause(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
return db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
}
if sb.Status != "running" {
return db.Sandbox{}, fmt.Errorf("sandbox is not running (status: %s)", sb.Status)
if sb.Status == "paused" {
return sb, nil
}
agent, _, err := s.agentForSandbox(ctx, sandboxID)
if _, err := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "running", Status_2: "pausing",
}); err != nil {
return db.Sandbox{}, fmt.Errorf("sandbox not in running state (current: %s)", sb.Status)
}
agent, err := s.agentForHost(ctx, sb.HostID)
if err != nil {
// Roll back the CAS so the sandbox isn't stuck in "pausing".
if _, rerr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "pausing", Status_2: "running",
}); rerr != nil {
slog.Warn("failed to roll back pausing→running", "id", id.FormatSandboxID(sandboxID), "error", rerr)
}
return db.Sandbox{}, err
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
teamIDStr := id.FormatTeamID(sb.TeamID)
// Pre-mark as "paused" in DB before the RPC so the reconciler does not
// mark the sandbox "stopped" while the host agent processes the pause.
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "paused",
}); err != nil {
return db.Sandbox{}, fmt.Errorf("pre-mark paused: %w", err)
}
go s.pauseInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent)
// Flush all metrics tiers before pausing so data survives in DB.
s.flushAndPersistMetrics(ctx, agent, sandboxID, true)
if _, err := agent.PauseSandbox(ctx, connect.NewRequest(&pb.PauseSandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil {
// Check if the agent still has this sandbox. If it was destroyed
// (e.g. frozen VM couldn't be resumed), mark as "error" instead of
// reverting to "running" — which would create a ghost record.
// Use a fresh context since the original ctx may already be expired.
revertStatus := "running"
pingCtx, pingCancel := context.WithTimeout(context.Background(), 10*time.Second)
if _, pingErr := agent.PingSandbox(pingCtx, connect.NewRequest(&pb.PingSandboxRequest{
SandboxId: sandboxIDStr,
})); pingErr != nil {
revertStatus = "error"
slog.Warn("sandbox gone from agent after failed pause, marking as error", "sandbox_id", sandboxIDStr)
}
pingCancel()
dbCtx, dbCancel := context.WithTimeout(context.Background(), 5*time.Second)
if _, dbErr := s.DB.UpdateSandboxStatus(dbCtx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: revertStatus,
}); dbErr != nil {
slog.Warn("failed to revert sandbox status after pause error", "sandbox_id", sandboxIDStr, "error", dbErr)
}
dbCancel()
return db.Sandbox{}, fmt.Errorf("agent pause: %w", err)
}
sb, err = s.DB.GetSandbox(ctx, sandboxID)
if err != nil {
return db.Sandbox{}, fmt.Errorf("get sandbox after pause: %w", err)
}
sb.Status = "pausing"
return sb, nil
}
// pauseInBackground performs the host-side half of Pause. It is started in
// its own goroutine after Pause has already moved the sandbox row from
// "running" to "pausing", and it runs on a context detached from the
// originating request with a five-minute budget.
//
// On success the row is moved "pausing" → "paused" and a "sandbox.paused"
// event is published. On failure the row is moved back "pausing" → "running"
// (best effort) and a "sandbox.pause_failed" event carrying the error text is
// published. All status writes are compare-and-set updates, so a concurrent
// transition out of "pausing" is not overwritten.
func (s *SandboxService) pauseInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, agent hostagentClient) {
	bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	// Flush metrics before the VM stops sampling so the persisted history
	// covers the entire run-up to the pause.
	s.flushAndPersistMetrics(bgCtx, agent, sandboxID, true)

	if _, err := agent.PauseSandbox(bgCtx, connect.NewRequest(&pb.PauseSandboxRequest{
		SandboxId: sandboxIDStr,
	})); err != nil {
		slog.Warn("background pause failed", "sandbox_id", sandboxIDStr, "error", err)

		// The RPC may have failed precisely because bgCtx ran out of time.
		// Recover on a fresh, short-lived context so the rollback and the
		// failure event are not cancelled along with it, which would leave
		// the sandbox stuck in "pausing".
		errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer errCancel()

		// Best-effort: try to recover the sandbox back to "running" so the
		// user isn't stuck in "pausing".
		if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
			ID: sandboxID, Status: "pausing", Status_2: "running",
		}); dbErr != nil {
			slog.Warn("failed to recover pausing→running after pause failure", "id", sandboxIDStr, "error", dbErr)
		}
		s.publishEvent(errCtx, SandboxStateEvent{
			Event: "sandbox.pause_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
			Error: err.Error(), Timestamp: time.Now().Unix(),
		})
		return
	}

	if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
		ID: sandboxID, Status: "pausing", Status_2: "paused",
	}); err != nil {
		slog.Warn("failed to update sandbox to paused", "sandbox_id", sandboxIDStr, "error", err)
	}

	s.publishEvent(bgCtx, SandboxStateEvent{
		Event: "sandbox.paused", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
		Timestamp: time.Now().Unix(),
	})
}
// Resume asynchronously resumes a paused sandbox on its original host.
// The DB CAS from "paused" to "resuming" gates concurrent Resume/Destroy.
func (s *SandboxService) Resume(ctx context.Context, sandboxID, teamID pgtype.UUID) (db.Sandbox, error) {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
return db.Sandbox{}, fmt.Errorf("sandbox not found: %w", err)
}
if sb.Status != "paused" {
return db.Sandbox{}, fmt.Errorf("sandbox is not paused (status: %s)", sb.Status)
if sb.Status == "running" {
return sb, nil
}
agent, _, err := s.agentForSandbox(ctx, sandboxID)
if _, err := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "paused", Status_2: "resuming",
}); err != nil {
return db.Sandbox{}, fmt.Errorf("sandbox not in paused state (current: %s)", sb.Status)
}
agent, err := s.agentForHost(ctx, sb.HostID)
if err != nil {
if _, rerr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "resuming", Status_2: "paused",
}); rerr != nil {
slog.Warn("failed to roll back resuming→paused", "id", id.FormatSandboxID(sandboxID), "error", rerr)
}
return db.Sandbox{}, err
}
// Look up template defaults so a resumed sandbox has the same env as
// the original Create did.
var defaultUser string
var defaultEnv map[string]string
if tmpl, terr := s.DB.GetTemplate(ctx, sb.TemplateID); terr == nil {
defaultUser = tmpl.DefaultUser
if len(tmpl.DefaultEnv) > 0 {
_ = json.Unmarshal(tmpl.DefaultEnv, &defaultEnv)
}
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
teamIDStr := id.FormatTeamID(sb.TeamID)
// Look up template defaults for resume.
var resumeDefaultUser string
var resumeDefaultEnv map[string]string
if sb.TemplateID.Valid {
tmpl, err := s.DB.GetTemplate(ctx, sb.TemplateID)
if err == nil {
resumeDefaultUser = tmpl.DefaultUser
if len(tmpl.DefaultEnv) > 0 {
_ = json.Unmarshal(tmpl.DefaultEnv, &resumeDefaultEnv)
}
}
}
// Extract kernel version hint from existing sandbox metadata.
var kernelVersion string
if len(sb.Metadata) > 0 {
var meta map[string]string
if err := json.Unmarshal(sb.Metadata, &meta); err == nil {
kernelVersion = meta["kernel_version"]
}
}
resp, err := agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxIDStr,
TimeoutSec: sb.TimeoutSec,
DefaultUser: resumeDefaultUser,
DefaultEnv: resumeDefaultEnv,
KernelVersion: kernelVersion,
}))
if err != nil {
return db.Sandbox{}, fmt.Errorf("agent resume: %w", err)
}
now := time.Now()
sb, err = s.DB.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
})
if err != nil {
return db.Sandbox{}, fmt.Errorf("update status: %w", err)
}
// Update metadata with actual versions used after resume.
if meta := resp.Msg.Metadata; len(meta) > 0 {
metaJSON, _ := json.Marshal(meta)
if err := s.DB.UpdateSandboxMetadata(ctx, db.UpdateSandboxMetadataParams{
ID: sandboxID,
Metadata: metaJSON,
}); err != nil {
slog.Warn("failed to update sandbox metadata after resume", "id", sandboxIDStr, "error", err)
}
sb.Metadata = metaJSON
}
go s.resumeInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, sb.TimeoutSec, defaultUser, defaultEnv)
sb.Status = "resuming"
return sb, nil
}
// resumeInBackground performs the host-side half of Resume. It is started in
// its own goroutine after Resume has already moved the sandbox row from
// "paused" to "resuming", and it runs on a context detached from the
// originating request with a five-minute budget.
//
// timeoutSec, defaultUser and defaultEnv are forwarded to the agent's
// ResumeSandbox RPC unchanged. On success the row is marked running with the
// agent-reported host IP, any metadata returned by the agent is stored, and a
// "sandbox.resumed" event is published. On failure the row is moved back
// "resuming" → "paused" (best effort) and a "sandbox.resume_failed" event
// carrying the error text is published.
func (s *SandboxService) resumeInBackground(
	sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string,
	agent hostagentClient, timeoutSec int32,
	defaultUser string, defaultEnv map[string]string,
) {
	bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	resp, err := agent.ResumeSandbox(bgCtx, connect.NewRequest(&pb.ResumeSandboxRequest{
		SandboxId:   sandboxIDStr,
		TimeoutSec:  timeoutSec,
		DefaultUser: defaultUser,
		DefaultEnv:  defaultEnv,
	}))
	if err != nil {
		slog.Warn("background resume failed", "sandbox_id", sandboxIDStr, "error", err)

		// The RPC may have failed precisely because bgCtx ran out of time.
		// Recover on a fresh, short-lived context so the rollback and the
		// failure event are not cancelled along with it, which would leave
		// the sandbox stuck in "resuming".
		errCtx, errCancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer errCancel()

		if _, dbErr := s.DB.UpdateSandboxStatusIf(errCtx, db.UpdateSandboxStatusIfParams{
			ID: sandboxID, Status: "resuming", Status_2: "paused",
		}); dbErr != nil {
			slog.Warn("failed to recover resuming→paused after resume failure", "id", sandboxIDStr, "error", dbErr)
		}
		s.publishEvent(errCtx, SandboxStateEvent{
			Event: "sandbox.resume_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
			Error: err.Error(), Timestamp: time.Now().Unix(),
		})
		return
	}

	now := time.Now()
	if _, err := s.DB.UpdateSandboxRunningIf(bgCtx, db.UpdateSandboxRunningIfParams{
		ID:        sandboxID,
		Status:    "resuming",
		HostIp:    resp.Msg.HostIp,
		StartedAt: pgtype.Timestamptz{Time: now, Valid: true},
	}); err != nil {
		slog.Warn("failed to update sandbox to running after resume", "id", sandboxIDStr, "error", err)
	}

	if meta := resp.Msg.Metadata; len(meta) > 0 {
		// The marshal error is ignored on purpose: meta is passed straight
		// to SandboxStateEvent.Metadata below, so it is a map[string]string.
		metaJSON, _ := json.Marshal(meta)
		if err := s.DB.UpdateSandboxMetadata(bgCtx, db.UpdateSandboxMetadataParams{
			ID: sandboxID, Metadata: metaJSON,
		}); err != nil {
			slog.Warn("failed to store sandbox metadata after resume", "id", sandboxIDStr, "error", err)
		}
	}

	s.publishEvent(bgCtx, SandboxStateEvent{
		Event: "sandbox.resumed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
		HostIP: resp.Msg.HostIp, Metadata: resp.Msg.Metadata,
		Timestamp: now.Unix(),
	})
}
// CreateSnapshot starts an asynchronous snapshot of a running or paused
// sandbox; the result is registered as a new template owned by the sandbox's
// team. An empty name is replaced with a generated one.
//
// The compare-and-set from the sandbox's current status to "snapshotting" is
// the authoritative gate against concurrent Pause/Snapshot/Destroy calls: if
// it fails, no agent RPC is issued. A running sandbox is snapshotted live and
// a paused one from its on-disk artefacts; in both cases the background
// worker returns the sandbox to the status it had on entry.
//
// It returns the sandbox (with Status set to "snapshotting") and the resolved
// snapshot name. Errors are returned when the sandbox cannot be found for the
// team, is neither running nor paused, the name is invalid or already taken,
// the status gate is lost, or no agent client can be obtained for its host.
func (s *SandboxService) CreateSnapshot(ctx context.Context, sandboxID, teamID pgtype.UUID, name string) (db.Sandbox, string, error) {
	sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
	if err != nil {
		return db.Sandbox{}, "", fmt.Errorf("sandbox not found: %w", err)
	}

	switch sb.Status {
	case "running", "paused":
		// Snapshot-able states; carry on.
	default:
		return db.Sandbox{}, "", fmt.Errorf("sandbox is not running or paused (status: %s)", sb.Status)
	}
	prior := sb.Status

	if name == "" {
		name = id.NewSnapshotName()
	}
	if nameErr := validate.SafeName(name); nameErr != nil {
		return db.Sandbox{}, "", fmt.Errorf("invalid name: %w", nameErr)
	}

	// Fail fast on a duplicate name: otherwise the VM would be paused and its
	// memory dumped only for the template insert to fail at the very end.
	_, lookupErr := s.DB.GetTemplateByTeam(ctx, db.GetTemplateByTeamParams{Name: name, TeamID: teamID})
	if lookupErr == nil {
		return db.Sandbox{}, "", fmt.Errorf("conflict: a snapshot named %q already exists", name)
	}

	// NOTE(review): prior and sb.Status hold the same value at this point, so
	// the message below reports the expected state as the current one.
	_, casErr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
		ID: sandboxID, Status: prior, Status_2: "snapshotting",
	})
	if casErr != nil {
		return db.Sandbox{}, "", fmt.Errorf("sandbox not in %s state (current: %s)", prior, sb.Status)
	}

	sandboxIDStr := id.FormatSandboxID(sandboxID)

	agent, err := s.agentForHost(ctx, sb.HostID)
	if err != nil {
		// Undo the gate so the sandbox is not left in "snapshotting".
		_, undoErr := s.DB.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
			ID: sandboxID, Status: "snapshotting", Status_2: prior,
		})
		if undoErr != nil {
			slog.Warn("failed to roll back snapshotting→"+prior, "id", sandboxIDStr, "error", undoErr)
		}
		return db.Sandbox{}, "", err
	}

	hostIDStr := id.FormatHostID(sb.HostID)
	teamIDStr := id.FormatTeamID(sb.TeamID)

	// Tell other clients that the badge moved to "snapshotting".
	s.publishStateChanged(ctx, sandboxIDStr, teamIDStr, hostIDStr, prior, "snapshotting")

	go s.snapshotInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, teamID, agent, name, prior, sb.Vcpus, sb.MemoryMb)

	sb.Status = "snapshotting"
	return sb, name, nil
}
// snapshotInBackground performs the host-side half of CreateSnapshot. It is
// started in its own goroutine after CreateSnapshot has already moved the
// sandbox row to "snapshotting".
//
// The agent RPC runs on a detached context with a ten-minute budget. All
// bookkeeping after the RPC (status restore, template registration, events)
// runs on a second, fresh context so that it still goes through when the RPC
// itself used up the whole budget.
//
// Regardless of the RPC outcome the sandbox is moved back from "snapshotting"
// to origStatus. On RPC failure, or when the template row cannot be inserted,
// a "sandbox.snapshot_failed" event is published; otherwise the snapshot is
// registered as a template of type "snapshot" owned by teamID (with the given
// vcpus/memoryMB and the agent-reported size) and "sandbox.snapshotted" is
// published.
func (s *SandboxService) snapshotInBackground(
	sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, teamID pgtype.UUID,
	agent hostagentClient, name, origStatus string, vcpus, memoryMB int32,
) {
	bgCtx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
	defer cancel()

	newTemplateID := id.NewSandboxID() // any random UUID
	templateUUID := pgtype.UUID{Bytes: newTemplateID.Bytes, Valid: true}

	resp, err := agent.CreateSnapshot(bgCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
		SandboxId:  sandboxIDStr,
		Name:       name,
		TeamId:     id.UUIDString(teamID),
		TemplateId: id.UUIDString(templateUUID),
	}))

	// bgCtx may be expired by now (a timed-out RPC is one way to get here).
	// Reusing it would cancel the status restore below and leave the sandbox
	// stuck in "snapshotting", so post-RPC bookkeeping gets its own deadline.
	doneCtx, doneCancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer doneCancel()

	// Either way, the host-side op is done; return the badge to its original
	// status (running for a live snapshot, paused for an on-disk one). Use a CAS
	// so a concurrent Destroy (which sets "stopping") wins: if the CAS misses,
	// the sandbox is no longer ours and we must NOT announce its old status. The
	// snapshot itself is still valid and is registered below — a snapshot
	// template outlives its source sandbox.
	if _, derr := s.DB.UpdateSandboxStatusIf(doneCtx, db.UpdateSandboxStatusIfParams{
		ID: sandboxID, Status: "snapshotting", Status_2: origStatus,
	}); derr != nil {
		slog.Warn("snapshotting→"+origStatus+" CAS missed (sandbox moved on); skipping state signal", "sandbox_id", sandboxIDStr, "error", derr)
	} else {
		s.publishStateChanged(doneCtx, sandboxIDStr, teamIDStr, hostIDStr, "snapshotting", origStatus)
	}

	if err != nil {
		slog.Warn("background snapshot failed", "sandbox_id", sandboxIDStr, "error", err)
		s.publishEvent(doneCtx, SandboxStateEvent{
			Event: "sandbox.snapshot_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
			Metadata: map[string]string{"name": name}, Error: err.Error(), Timestamp: time.Now().Unix(),
		})
		return
	}

	if _, err := s.DB.InsertTemplate(doneCtx, db.InsertTemplateParams{
		ID:          templateUUID,
		Name:        name,
		Type:        "snapshot",
		Vcpus:       vcpus,
		MemoryMb:    memoryMB,
		SizeBytes:   resp.Msg.SizeBytes,
		TeamID:      teamID,
		DefaultUser: "",
		DefaultEnv:  []byte("{}"),
		Metadata:    []byte("{}"),
	}); err != nil {
		slog.Warn("failed to insert snapshot template", "sandbox_id", sandboxIDStr, "name", name, "error", err)
		s.publishEvent(doneCtx, SandboxStateEvent{
			Event: "sandbox.snapshot_failed", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
			Metadata: map[string]string{"name": name}, Error: "failed to register snapshot", Timestamp: time.Now().Unix(),
		})
		return
	}

	s.publishEvent(doneCtx, SandboxStateEvent{
		Event: "sandbox.snapshotted", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
		Metadata: map[string]string{"name": name}, Timestamp: time.Now().Unix(),
	})
}
// publishStateChanged emits a transient "sandbox.state_changed" event whose
// metadata records the previous ("from") and new ("to") status. It lets the
// dashboard flip the status badge during a transition that has no terminal
// lifecycle event of its own (e.g. the snapshotting round-trip).
func (s *SandboxService) publishStateChanged(ctx context.Context, sandboxIDStr, teamIDStr, hostIDStr, from, to string) {
	transition := SandboxStateEvent{
		Event:     "sandbox.state_changed",
		SandboxID: sandboxIDStr,
		TeamID:    teamIDStr,
		HostID:    hostIDStr,
		Metadata:  map[string]string{"from": from, "to": to},
		Timestamp: time.Now().Unix(),
	}
	s.publishEvent(ctx, transition)
}
// Destroy stops a sandbox asynchronously. Pre-marks the DB status as
// "stopping" and fires the agent RPC in a background goroutine.
func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
if err != nil {
return fmt.Errorf("sandbox not found: %w", err)
}
if sb.Status == "stopped" || sb.Status == "error" {
return nil
}
agent, _, err := s.agentForSandbox(ctx, sandboxID)
if err != nil {
@ -361,35 +616,54 @@ func (s *SandboxService) Destroy(ctx context.Context, sandboxID, teamID pgtype.U
}
sandboxIDStr := id.FormatSandboxID(sandboxID)
hostIDStr := id.FormatHostID(sb.HostID)
teamIDStr := id.FormatTeamID(sb.TeamID)
prevStatus := sb.Status
// If running, flush 24h tier metrics for analytics before destroying.
if sb.Status == "running" {
s.flushAndPersistMetrics(ctx, agent, sandboxID, false)
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "stopping",
}); err != nil {
return fmt.Errorf("pre-mark stopping: %w", err)
}
// Destroy on host agent. A not-found response is fine — sandbox is already gone.
if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
go s.destroyInBackground(sandboxID, sandboxIDStr, hostIDStr, teamIDStr, agent, prevStatus)
return nil
}
func (s *SandboxService) destroyInBackground(sandboxID pgtype.UUID, sandboxIDStr, hostIDStr, teamIDStr string, agent hostagentClient, prevStatus string) {
bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
if prevStatus == "running" || prevStatus == "pausing" {
s.flushAndPersistMetrics(bgCtx, agent, sandboxID, false)
}
if _, err := agent.DestroySandbox(bgCtx, connect.NewRequest(&pb.DestroySandboxRequest{
SandboxId: sandboxIDStr,
})); err != nil && connect.CodeOf(err) != connect.CodeNotFound {
return fmt.Errorf("agent destroy: %w", err)
slog.Warn("background destroy failed", "sandbox_id", sandboxIDStr, "error", err)
}
// For a paused sandbox, only keep 24h tier; remove the finer-grained tiers.
if sb.Status == "paused" {
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
if prevStatus == "paused" {
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
SandboxID: sandboxID, Tier: "10m",
})
_ = s.DB.DeleteSandboxMetricPointsByTier(ctx, db.DeleteSandboxMetricPointsByTierParams{
_ = s.DB.DeleteSandboxMetricPointsByTier(bgCtx, db.DeleteSandboxMetricPointsByTierParams{
SandboxID: sandboxID, Tier: "2h",
})
}
if _, err := s.DB.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "stopped",
if _, err := s.DB.UpdateSandboxStatusIf(bgCtx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "stopping", Status_2: "stopped",
}); err != nil {
return fmt.Errorf("update status: %w", err)
slog.Warn("failed to update sandbox to stopped", "sandbox_id", sandboxIDStr, "error", err)
}
return nil
s.publishEvent(bgCtx, SandboxStateEvent{
Event: "sandbox.stopped", SandboxID: sandboxIDStr, TeamID: teamIDStr, HostID: hostIDStr,
Timestamp: time.Now().Unix(),
})
}
// flushAndPersistMetrics calls FlushSandboxMetrics on the agent and stores
@ -429,6 +703,40 @@ func (s *SandboxService) persistMetricPoints(ctx context.Context, sandboxID pgty
}
}
// GetDiskUsage reports a sandbox's disk usage in bytes.
//
// When the sandbox is running or paused, the host agent is asked for the
// last five minutes of metrics and the DiskBytes of the most recent point is
// returned. If the sandbox is in any other state, the agent client cannot be
// obtained, the RPC fails, or it yields no points, the latest metric point
// stored in the database is used instead.
//
// An error is returned when the sandbox does not exist for the given team or
// when the database fallback lookup fails.
func (s *SandboxService) GetDiskUsage(ctx context.Context, sandboxID, teamID pgtype.UUID) (int64, error) {
	sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})
	if err != nil {
		return 0, fmt.Errorf("sandbox not found: %w", err)
	}

	// Live path: only attempted for states in which the agent is consulted.
	switch sb.Status {
	case "running", "paused":
		liveID := id.FormatSandboxID(sandboxID)
		if agent, agentErr := s.agentForHost(ctx, sb.HostID); agentErr == nil {
			metrics, rpcErr := agent.GetSandboxMetrics(ctx, connect.NewRequest(&pb.GetSandboxMetricsRequest{
				SandboxId: liveID,
				Range:     "5m",
			}))
			if rpcErr == nil {
				if pts := metrics.Msg.Points; len(pts) > 0 {
					return pts[len(pts)-1].DiskBytes, nil
				}
			}
		}
	}

	// Fallback path: last metric point persisted for this sandbox.
	stored, err := s.DB.GetLatestSandboxMetricPoint(ctx, sandboxID)
	if err != nil {
		return 0, err
	}
	return stored.DiskBytes, nil
}
// Ping resets the inactivity timer for a running sandbox.
func (s *SandboxService) Ping(ctx context.Context, sandboxID, teamID pgtype.UUID) error {
sb, err := s.DB.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: teamID})

View File

@ -479,6 +479,8 @@ type AdminTeamRow struct {
OwnerEmail string
ActiveSandboxCount int32
ChannelCount int32
RunningVcpus int32
RunningMemoryMb int32
}
// AdminListTeams returns a paginated list of all teams (excluding the platform
@ -511,6 +513,8 @@ func (s *TeamService) AdminListTeams(ctx context.Context, limit, offset int32) (
OwnerEmail: t.OwnerEmail,
ActiveSandboxCount: t.ActiveSandboxCount,
ChannelCount: t.ChannelCount,
RunningVcpus: t.RunningVcpus,
RunningMemoryMb: t.RunningMemoryMb,
}
if t.DeletedAt.Valid {
deletedAt := t.DeletedAt.Time