forked from wrenn/wrenn
Add template build system with admin panel, async workers, and FlattenRootfs RPC
Introduces an end-to-end template building pipeline: admins submit a recipe
(list of shell commands) via the dashboard, a Redis-backed worker pool spins
up a sandbox, executes each command, and produces either a full snapshot
(with healthcheck) or an image-only template (rootfs flattened via a new
FlattenRootfs host-agent RPC). Build progress and per-step logs are persisted
to a new template_builds table and polled by the frontend.
Backend:
- New FlattenRootfs RPC (proto + host agent + sandbox manager)
- BuildService with Redis queue (BLPOP) and configurable worker pool (default 2)
- Admin-only REST endpoints: POST/GET /v1/admin/builds, GET /v1/admin/builds/{id}
- Migration for template_builds table with JSONB logs and recipe columns
- sqlc queries for build CRUD and progress updates
Frontend:
- /admin/templates page with Templates + Builds tabs
- Create Template dialog with recipe textarea, healthcheck, specs
- Build history with expandable per-step logs, status badges, progress bars
- Auto-polling every 3s for active builds
- AdminSidebar updated with Templates nav item
This commit is contained in:
156
internal/api/handlers_builds.go
Normal file
156
internal/api/handlers_builds.go
Normal file
@ -0,0 +1,156 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"git.omukk.dev/wrenn/sandbox/internal/db"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/service"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/validate"
|
||||
)
|
||||
|
||||
type buildHandler struct {
|
||||
svc *service.BuildService
|
||||
}
|
||||
|
||||
func newBuildHandler(svc *service.BuildService) *buildHandler {
|
||||
return &buildHandler{svc: svc}
|
||||
}
|
||||
|
||||
// createBuildRequest is the JSON body accepted by POST /v1/admin/builds.
type createBuildRequest struct {
	// Name is the name of the template to produce. Required.
	Name string `json:"name"`
	// BaseTemplate is the template the build sandbox is created from.
	BaseTemplate string `json:"base_template"`
	// Recipe is the ordered list of shell commands to run. Must be non-empty.
	Recipe []string `json:"recipe"`
	// Healthcheck is an optional shell command.
	Healthcheck string `json:"healthcheck"`
	// VCPUs is the requested vCPU count for the build sandbox.
	VCPUs int32 `json:"vcpus"`
	// MemoryMB is the requested memory size for the build sandbox, in MB.
	MemoryMB int32 `json:"memory_mb"`
}
|
||||
|
||||
// buildResponse is the JSON representation of a template build returned by
// the admin build endpoints. Optional fields are pointers so that they are
// omitted from the output when unset.
type buildResponse struct {
	ID           string `json:"id"`
	Name         string `json:"name"`
	BaseTemplate string `json:"base_template"`
	// Recipe is passed through as raw JSON exactly as stored.
	Recipe      json.RawMessage `json:"recipe"`
	Healthcheck *string         `json:"healthcheck,omitempty"`
	VCPUs       int32           `json:"vcpus"`
	MemoryMB    int32           `json:"memory_mb"`
	Status      string          `json:"status"`
	CurrentStep int32           `json:"current_step"`
	TotalSteps  int32           `json:"total_steps"`
	// Logs is passed through as raw JSON exactly as stored.
	Logs      json.RawMessage `json:"logs"`
	Error     *string         `json:"error,omitempty"`
	SandboxID *string         `json:"sandbox_id,omitempty"`
	HostID    *string         `json:"host_id,omitempty"`
	// Timestamps are RFC 3339 strings.
	CreatedAt   string  `json:"created_at"`
	StartedAt   *string `json:"started_at,omitempty"`
	CompletedAt *string `json:"completed_at,omitempty"`
}
|
||||
|
||||
func buildToResponse(b db.TemplateBuild) buildResponse {
|
||||
resp := buildResponse{
|
||||
ID: b.ID,
|
||||
Name: b.Name,
|
||||
BaseTemplate: b.BaseTemplate,
|
||||
Recipe: b.Recipe,
|
||||
VCPUs: b.Vcpus,
|
||||
MemoryMB: b.MemoryMb,
|
||||
Status: b.Status,
|
||||
CurrentStep: b.CurrentStep,
|
||||
TotalSteps: b.TotalSteps,
|
||||
Logs: b.Logs,
|
||||
}
|
||||
if b.Healthcheck.Valid {
|
||||
resp.Healthcheck = &b.Healthcheck.String
|
||||
}
|
||||
if b.Error.Valid {
|
||||
resp.Error = &b.Error.String
|
||||
}
|
||||
if b.SandboxID.Valid {
|
||||
resp.SandboxID = &b.SandboxID.String
|
||||
}
|
||||
if b.HostID.Valid {
|
||||
resp.HostID = &b.HostID.String
|
||||
}
|
||||
if b.CreatedAt.Valid {
|
||||
resp.CreatedAt = b.CreatedAt.Time.Format(time.RFC3339)
|
||||
}
|
||||
if b.StartedAt.Valid {
|
||||
s := b.StartedAt.Time.Format(time.RFC3339)
|
||||
resp.StartedAt = &s
|
||||
}
|
||||
if b.CompletedAt.Valid {
|
||||
s := b.CompletedAt.Time.Format(time.RFC3339)
|
||||
resp.CompletedAt = &s
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// Create handles POST /v1/admin/builds.
|
||||
func (h *buildHandler) Create(w http.ResponseWriter, r *http.Request) {
|
||||
var req createBuildRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "invalid JSON body")
|
||||
return
|
||||
}
|
||||
|
||||
if req.Name == "" {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "name is required")
|
||||
return
|
||||
}
|
||||
if err := validate.SafeName(req.Name); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", fmt.Sprintf("invalid template name: %s", err))
|
||||
return
|
||||
}
|
||||
if len(req.Recipe) == 0 {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "recipe must contain at least one command")
|
||||
return
|
||||
}
|
||||
|
||||
build, err := h.svc.Create(r.Context(), service.BuildCreateParams{
|
||||
Name: req.Name,
|
||||
BaseTemplate: req.BaseTemplate,
|
||||
Recipe: req.Recipe,
|
||||
Healthcheck: req.Healthcheck,
|
||||
VCPUs: req.VCPUs,
|
||||
MemoryMB: req.MemoryMB,
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "build_error", err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusCreated, buildToResponse(build))
|
||||
}
|
||||
|
||||
// List handles GET /v1/admin/builds.
|
||||
func (h *buildHandler) List(w http.ResponseWriter, r *http.Request) {
|
||||
builds, err := h.svc.List(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "db_error", "failed to list builds")
|
||||
return
|
||||
}
|
||||
|
||||
resp := make([]buildResponse, len(builds))
|
||||
for i, b := range builds {
|
||||
resp[i] = buildToResponse(b)
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// Get handles GET /v1/admin/builds/{id}.
|
||||
func (h *buildHandler) Get(w http.ResponseWriter, r *http.Request) {
|
||||
buildID := chi.URLParam(r, "id")
|
||||
|
||||
build, err := h.svc.Get(r.Context(), buildID)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusNotFound, "not_found", "build not found")
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, buildToResponse(build))
|
||||
}
|
||||
@ -22,7 +22,8 @@ var openapiYAML []byte
|
||||
|
||||
// Server is the control plane HTTP server.
|
||||
type Server struct {
|
||||
router chi.Router
|
||||
router chi.Router
|
||||
BuildSvc *service.BuildService
|
||||
}
|
||||
|
||||
// New constructs the chi router and registers all routes.
|
||||
@ -47,6 +48,7 @@ func New(
|
||||
teamSvc := &service.TeamService{DB: queries, Pool: pgPool, HostPool: pool}
|
||||
auditSvc := &service.AuditService{DB: queries}
|
||||
statsSvc := &service.StatsService{DB: queries, Pool: pgPool}
|
||||
buildSvc := &service.BuildService{DB: queries, Redis: rdb, Pool: pool, Scheduler: sched}
|
||||
|
||||
al := audit.New(queries)
|
||||
|
||||
@ -65,6 +67,7 @@ func New(
|
||||
auditH := newAuditHandler(auditSvc)
|
||||
statsH := newStatsHandler(statsSvc)
|
||||
metricsH := newSandboxMetricsHandler(queries, pool)
|
||||
buildH := newBuildHandler(buildSvc)
|
||||
|
||||
// OpenAPI spec and docs.
|
||||
r.Get("/openapi.yaml", serveOpenAPI)
|
||||
@ -174,9 +177,12 @@ func New(
|
||||
r.Use(requireJWT(jwtSecret))
|
||||
r.Use(requireAdmin(queries))
|
||||
r.Put("/teams/{id}/byoc", teamH.SetBYOC)
|
||||
r.Post("/builds", buildH.Create)
|
||||
r.Get("/builds", buildH.List)
|
||||
r.Get("/builds/{id}", buildH.Get)
|
||||
})
|
||||
|
||||
return &Server{router: r}
|
||||
return &Server{router: r, BuildSvc: buildSvc}
|
||||
}
|
||||
|
||||
// Handler returns the HTTP handler.
|
||||
|
||||
@ -147,6 +147,26 @@ type Template struct {
|
||||
TeamID string `json:"team_id"`
|
||||
}
|
||||
|
||||
type TemplateBuild struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
BaseTemplate string `json:"base_template"`
|
||||
Recipe []byte `json:"recipe"`
|
||||
Healthcheck pgtype.Text `json:"healthcheck"`
|
||||
Vcpus int32 `json:"vcpus"`
|
||||
MemoryMb int32 `json:"memory_mb"`
|
||||
Status string `json:"status"`
|
||||
CurrentStep int32 `json:"current_step"`
|
||||
TotalSteps int32 `json:"total_steps"`
|
||||
Logs []byte `json:"logs"`
|
||||
Error pgtype.Text `json:"error"`
|
||||
SandboxID pgtype.Text `json:"sandbox_id"`
|
||||
HostID pgtype.Text `json:"host_id"`
|
||||
CreatedAt pgtype.Timestamptz `json:"created_at"`
|
||||
StartedAt pgtype.Timestamptz `json:"started_at"`
|
||||
CompletedAt pgtype.Timestamptz `json:"completed_at"`
|
||||
}
|
||||
|
||||
type User struct {
|
||||
ID string `json:"id"`
|
||||
Email string `json:"email"`
|
||||
|
||||
223
internal/db/template_builds.sql.go
Normal file
223
internal/db/template_builds.sql.go
Normal file
@ -0,0 +1,223 @@
|
||||
// Code generated by sqlc. DO NOT EDIT.
|
||||
// versions:
|
||||
// sqlc v1.30.0
|
||||
// source: template_builds.sql
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
)
|
||||
|
||||
const getTemplateBuild = `-- name: GetTemplateBuild :one
|
||||
SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at FROM template_builds WHERE id = $1
|
||||
`
|
||||
|
||||
func (q *Queries) GetTemplateBuild(ctx context.Context, id string) (TemplateBuild, error) {
|
||||
row := q.db.QueryRow(ctx, getTemplateBuild, id)
|
||||
var i TemplateBuild
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.BaseTemplate,
|
||||
&i.Recipe,
|
||||
&i.Healthcheck,
|
||||
&i.Vcpus,
|
||||
&i.MemoryMb,
|
||||
&i.Status,
|
||||
&i.CurrentStep,
|
||||
&i.TotalSteps,
|
||||
&i.Logs,
|
||||
&i.Error,
|
||||
&i.SandboxID,
|
||||
&i.HostID,
|
||||
&i.CreatedAt,
|
||||
&i.StartedAt,
|
||||
&i.CompletedAt,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const insertTemplateBuild = `-- name: InsertTemplateBuild :one
|
||||
INSERT INTO template_builds (id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, total_steps)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, 'pending', $8)
|
||||
RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at
|
||||
`
|
||||
|
||||
type InsertTemplateBuildParams struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
BaseTemplate string `json:"base_template"`
|
||||
Recipe []byte `json:"recipe"`
|
||||
Healthcheck pgtype.Text `json:"healthcheck"`
|
||||
Vcpus int32 `json:"vcpus"`
|
||||
MemoryMb int32 `json:"memory_mb"`
|
||||
TotalSteps int32 `json:"total_steps"`
|
||||
}
|
||||
|
||||
func (q *Queries) InsertTemplateBuild(ctx context.Context, arg InsertTemplateBuildParams) (TemplateBuild, error) {
|
||||
row := q.db.QueryRow(ctx, insertTemplateBuild,
|
||||
arg.ID,
|
||||
arg.Name,
|
||||
arg.BaseTemplate,
|
||||
arg.Recipe,
|
||||
arg.Healthcheck,
|
||||
arg.Vcpus,
|
||||
arg.MemoryMb,
|
||||
arg.TotalSteps,
|
||||
)
|
||||
var i TemplateBuild
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.BaseTemplate,
|
||||
&i.Recipe,
|
||||
&i.Healthcheck,
|
||||
&i.Vcpus,
|
||||
&i.MemoryMb,
|
||||
&i.Status,
|
||||
&i.CurrentStep,
|
||||
&i.TotalSteps,
|
||||
&i.Logs,
|
||||
&i.Error,
|
||||
&i.SandboxID,
|
||||
&i.HostID,
|
||||
&i.CreatedAt,
|
||||
&i.StartedAt,
|
||||
&i.CompletedAt,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
|
||||
const listTemplateBuilds = `-- name: ListTemplateBuilds :many
|
||||
SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at FROM template_builds ORDER BY created_at DESC
|
||||
`
|
||||
|
||||
func (q *Queries) ListTemplateBuilds(ctx context.Context) ([]TemplateBuild, error) {
|
||||
rows, err := q.db.Query(ctx, listTemplateBuilds)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []TemplateBuild
|
||||
for rows.Next() {
|
||||
var i TemplateBuild
|
||||
if err := rows.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.BaseTemplate,
|
||||
&i.Recipe,
|
||||
&i.Healthcheck,
|
||||
&i.Vcpus,
|
||||
&i.MemoryMb,
|
||||
&i.Status,
|
||||
&i.CurrentStep,
|
||||
&i.TotalSteps,
|
||||
&i.Logs,
|
||||
&i.Error,
|
||||
&i.SandboxID,
|
||||
&i.HostID,
|
||||
&i.CreatedAt,
|
||||
&i.StartedAt,
|
||||
&i.CompletedAt,
|
||||
); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const updateBuildError = `-- name: UpdateBuildError :exec
|
||||
UPDATE template_builds
|
||||
SET error = $2, status = 'failed', completed_at = NOW()
|
||||
WHERE id = $1
|
||||
`
|
||||
|
||||
type UpdateBuildErrorParams struct {
|
||||
ID string `json:"id"`
|
||||
Error pgtype.Text `json:"error"`
|
||||
}
|
||||
|
||||
func (q *Queries) UpdateBuildError(ctx context.Context, arg UpdateBuildErrorParams) error {
|
||||
_, err := q.db.Exec(ctx, updateBuildError, arg.ID, arg.Error)
|
||||
return err
|
||||
}
|
||||
|
||||
const updateBuildProgress = `-- name: UpdateBuildProgress :exec
|
||||
UPDATE template_builds
|
||||
SET current_step = $2, logs = $3
|
||||
WHERE id = $1
|
||||
`
|
||||
|
||||
type UpdateBuildProgressParams struct {
|
||||
ID string `json:"id"`
|
||||
CurrentStep int32 `json:"current_step"`
|
||||
Logs []byte `json:"logs"`
|
||||
}
|
||||
|
||||
func (q *Queries) UpdateBuildProgress(ctx context.Context, arg UpdateBuildProgressParams) error {
|
||||
_, err := q.db.Exec(ctx, updateBuildProgress, arg.ID, arg.CurrentStep, arg.Logs)
|
||||
return err
|
||||
}
|
||||
|
||||
const updateBuildSandbox = `-- name: UpdateBuildSandbox :exec
|
||||
UPDATE template_builds
|
||||
SET sandbox_id = $2, host_id = $3
|
||||
WHERE id = $1
|
||||
`
|
||||
|
||||
type UpdateBuildSandboxParams struct {
|
||||
ID string `json:"id"`
|
||||
SandboxID pgtype.Text `json:"sandbox_id"`
|
||||
HostID pgtype.Text `json:"host_id"`
|
||||
}
|
||||
|
||||
func (q *Queries) UpdateBuildSandbox(ctx context.Context, arg UpdateBuildSandboxParams) error {
|
||||
_, err := q.db.Exec(ctx, updateBuildSandbox, arg.ID, arg.SandboxID, arg.HostID)
|
||||
return err
|
||||
}
|
||||
|
||||
const updateBuildStatus = `-- name: UpdateBuildStatus :one
|
||||
UPDATE template_builds
|
||||
SET status = $2,
|
||||
started_at = CASE WHEN $2 = 'running' AND started_at IS NULL THEN NOW() ELSE started_at END,
|
||||
completed_at = CASE WHEN $2 IN ('success', 'failed') THEN NOW() ELSE completed_at END
|
||||
WHERE id = $1
|
||||
RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at
|
||||
`
|
||||
|
||||
type UpdateBuildStatusParams struct {
|
||||
ID string `json:"id"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func (q *Queries) UpdateBuildStatus(ctx context.Context, arg UpdateBuildStatusParams) (TemplateBuild, error) {
|
||||
row := q.db.QueryRow(ctx, updateBuildStatus, arg.ID, arg.Status)
|
||||
var i TemplateBuild
|
||||
err := row.Scan(
|
||||
&i.ID,
|
||||
&i.Name,
|
||||
&i.BaseTemplate,
|
||||
&i.Recipe,
|
||||
&i.Healthcheck,
|
||||
&i.Vcpus,
|
||||
&i.MemoryMb,
|
||||
&i.Status,
|
||||
&i.CurrentStep,
|
||||
&i.TotalSteps,
|
||||
&i.Logs,
|
||||
&i.Error,
|
||||
&i.SandboxID,
|
||||
&i.HostID,
|
||||
&i.CreatedAt,
|
||||
&i.StartedAt,
|
||||
&i.CompletedAt,
|
||||
)
|
||||
return i, err
|
||||
}
|
||||
@ -110,6 +110,19 @@ func (s *Server) DeleteSnapshot(
|
||||
return connect.NewResponse(&pb.DeleteSnapshotResponse{}), nil
|
||||
}
|
||||
|
||||
func (s *Server) FlattenRootfs(
|
||||
ctx context.Context,
|
||||
req *connect.Request[pb.FlattenRootfsRequest],
|
||||
) (*connect.Response[pb.FlattenRootfsResponse], error) {
|
||||
sizeBytes, err := s.mgr.FlattenRootfs(ctx, req.Msg.SandboxId, req.Msg.Name)
|
||||
if err != nil {
|
||||
return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("flatten rootfs: %w", err))
|
||||
}
|
||||
return connect.NewResponse(&pb.FlattenRootfsResponse{
|
||||
SizeBytes: sizeBytes,
|
||||
}), nil
|
||||
}
|
||||
|
||||
func (s *Server) PingSandbox(
|
||||
ctx context.Context,
|
||||
req *connect.Request[pb.PingSandboxRequest],
|
||||
|
||||
@ -78,6 +78,11 @@ func NewAuditLogID() string {
|
||||
return "log-" + hex8()
|
||||
}
|
||||
|
||||
// NewBuildID generates a new template build ID in the format "bld-" + 8 hex chars.
|
||||
func NewBuildID() string {
|
||||
return "bld-" + hex8()
|
||||
}
|
||||
|
||||
// NewRefreshToken generates a 64-char hex token (32 bytes of entropy) for use as a host refresh token.
|
||||
func NewRefreshToken() string {
|
||||
b := make([]byte, 32)
|
||||
|
||||
@ -795,6 +795,88 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
|
||||
return sizeBytes, nil
|
||||
}
|
||||
|
||||
// FlattenRootfs stops a running sandbox, flattens its device-mapper CoW
|
||||
// rootfs into a standalone rootfs.ext4, and cleans up all resources.
|
||||
// The result is an image-only template (no VM memory/CPU state) stored in
|
||||
// ImagesDir/{name}/rootfs.ext4.
|
||||
func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID, name string) (int64, error) {
|
||||
if err := validate.SafeName(name); err != nil {
|
||||
return 0, fmt.Errorf("invalid template name: %w", err)
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
sb, ok := m.boxes[sandboxID]
|
||||
if ok {
|
||||
delete(m.boxes, sandboxID)
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("sandbox %s not found", sandboxID)
|
||||
}
|
||||
|
||||
// Stop the VM but keep the dm device alive for flattening.
|
||||
m.stopSampler(sb)
|
||||
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
|
||||
slog.Warn("vm destroy error during flatten", "id", sb.ID, "error", err)
|
||||
}
|
||||
|
||||
// Release network resources — not needed after VM is stopped.
|
||||
if err := network.RemoveNetwork(sb.slot); err != nil {
|
||||
slog.Warn("network cleanup error during flatten", "id", sb.ID, "error", err)
|
||||
}
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
|
||||
if sb.uffdSocketPath != "" {
|
||||
os.Remove(sb.uffdSocketPath)
|
||||
}
|
||||
|
||||
// Create template directory and flatten the dm-snapshot.
|
||||
if err := snapshot.EnsureDir(m.cfg.ImagesDir, name); err != nil {
|
||||
m.cleanupDM(sb)
|
||||
return 0, fmt.Errorf("create template dir: %w", err)
|
||||
}
|
||||
|
||||
outputPath := snapshot.RootfsPath(m.cfg.ImagesDir, name)
|
||||
if sb.dmDevice == nil {
|
||||
return 0, fmt.Errorf("sandbox %s has no dm device", sandboxID)
|
||||
}
|
||||
|
||||
if err := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, outputPath); err != nil {
|
||||
m.cleanupDM(sb)
|
||||
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
|
||||
return 0, fmt.Errorf("flatten rootfs: %w", err)
|
||||
}
|
||||
|
||||
// Clean up dm device and loop device now that flatten is complete.
|
||||
m.cleanupDM(sb)
|
||||
|
||||
sizeBytes, err := snapshot.DirSize(m.cfg.ImagesDir, name)
|
||||
if err != nil {
|
||||
slog.Warn("failed to calculate template size", "error", err)
|
||||
}
|
||||
|
||||
slog.Info("rootfs flattened to image-only template",
|
||||
"sandbox", sandboxID,
|
||||
"name", name,
|
||||
"size_bytes", sizeBytes,
|
||||
)
|
||||
return sizeBytes, nil
|
||||
}
|
||||
|
||||
// cleanupDM tears down the dm-snapshot device and releases the base image loop device.
|
||||
func (m *Manager) cleanupDM(sb *sandboxState) {
|
||||
if sb.dmDevice != nil {
|
||||
if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
|
||||
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
|
||||
}
|
||||
os.Remove(sb.dmDevice.CowPath)
|
||||
}
|
||||
if sb.baseImagePath != "" {
|
||||
m.loops.Release(sb.baseImagePath)
|
||||
}
|
||||
}
|
||||
|
||||
// DeleteSnapshot removes a snapshot template from disk.
|
||||
func (m *Manager) DeleteSnapshot(name string) error {
|
||||
if err := validate.SafeName(name); err != nil {
|
||||
|
||||
385
internal/service/build.go
Normal file
385
internal/service/build.go
Normal file
@ -0,0 +1,385 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"connectrpc.com/connect"
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
"github.com/redis/go-redis/v9"
|
||||
|
||||
"git.omukk.dev/wrenn/sandbox/internal/db"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/id"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/lifecycle"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/scheduler"
|
||||
pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
|
||||
)
|
||||
|
||||
const (
	// buildQueueKey is the Redis list that carries pending build IDs.
	buildQueueKey = "wrenn:build_queue"
	// platformTeamID is the team used for host selection and as the owner
	// of templates produced by builds.
	platformTeamID = "platform"

	// buildCommandTimeout bounds a single recipe command.
	buildCommandTimeout = 30 * time.Second
	// healthcheckInterval is the delay between healthcheck attempts.
	healthcheckInterval = time.Second
	// healthcheckTimeout is how long a healthcheck may keep failing before
	// the build is failed.
	healthcheckTimeout = time.Minute
)
|
||||
|
||||
// buildAgentClient is the subset of the host agent client used by the build worker.
|
||||
type buildAgentClient interface {
|
||||
CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
|
||||
DestroySandbox(ctx context.Context, req *connect.Request[pb.DestroySandboxRequest]) (*connect.Response[pb.DestroySandboxResponse], error)
|
||||
Exec(ctx context.Context, req *connect.Request[pb.ExecRequest]) (*connect.Response[pb.ExecResponse], error)
|
||||
CreateSnapshot(ctx context.Context, req *connect.Request[pb.CreateSnapshotRequest]) (*connect.Response[pb.CreateSnapshotResponse], error)
|
||||
FlattenRootfs(ctx context.Context, req *connect.Request[pb.FlattenRootfsRequest]) (*connect.Response[pb.FlattenRootfsResponse], error)
|
||||
}
|
||||
|
||||
// BuildLogEntry is one element of the JSON array stored in a build's logs
// column; one entry is recorded per executed recipe step.
type BuildLogEntry struct {
	// Step is the 1-based position of the command in the recipe.
	Step int `json:"step"`
	// Cmd is the shell command that was run.
	Cmd string `json:"cmd"`
	// Stdout is the command's captured standard output.
	Stdout string `json:"stdout"`
	// Stderr is the command's captured standard error, or the error text
	// when the exec call itself failed.
	Stderr string `json:"stderr"`
	// Exit is the command's exit code.
	Exit int32 `json:"exit"`
	// Ok is true when the command exited with code 0.
	Ok bool `json:"ok"`
	// Elapsed is the wall-clock duration of the step in milliseconds.
	Elapsed int64 `json:"elapsed_ms"`
}
|
||||
|
||||
// BuildService handles template build orchestration.
|
||||
type BuildService struct {
|
||||
DB *db.Queries
|
||||
Redis *redis.Client
|
||||
Pool *lifecycle.HostClientPool
|
||||
Scheduler scheduler.HostScheduler
|
||||
}
|
||||
|
||||
// BuildCreateParams is the input to BuildService.Create.
type BuildCreateParams struct {
	// Name is the name of the template to produce.
	Name string
	// BaseTemplate is the template to build from; empty selects the default.
	BaseTemplate string
	// Recipe is the ordered list of shell commands to execute.
	Recipe []string
	// Healthcheck is an optional shell command; empty means none.
	Healthcheck string
	// VCPUs is the vCPU count; zero or negative selects the default.
	VCPUs int32
	// MemoryMB is the memory size in MB; zero or negative selects the default.
	MemoryMB int32
}
|
||||
|
||||
// Create inserts a new build record and enqueues it to Redis.
|
||||
func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.TemplateBuild, error) {
|
||||
if p.BaseTemplate == "" {
|
||||
p.BaseTemplate = "minimal"
|
||||
}
|
||||
if p.VCPUs <= 0 {
|
||||
p.VCPUs = 1
|
||||
}
|
||||
if p.MemoryMB <= 0 {
|
||||
p.MemoryMB = 512
|
||||
}
|
||||
|
||||
recipeJSON, err := json.Marshal(p.Recipe)
|
||||
if err != nil {
|
||||
return db.TemplateBuild{}, fmt.Errorf("marshal recipe: %w", err)
|
||||
}
|
||||
|
||||
buildID := id.NewBuildID()
|
||||
|
||||
build, err := s.DB.InsertTemplateBuild(ctx, db.InsertTemplateBuildParams{
|
||||
ID: buildID,
|
||||
Name: p.Name,
|
||||
BaseTemplate: p.BaseTemplate,
|
||||
Recipe: recipeJSON,
|
||||
Healthcheck: pgtype.Text{String: p.Healthcheck, Valid: p.Healthcheck != ""},
|
||||
Vcpus: p.VCPUs,
|
||||
MemoryMb: p.MemoryMB,
|
||||
TotalSteps: int32(len(p.Recipe)),
|
||||
})
|
||||
if err != nil {
|
||||
return db.TemplateBuild{}, fmt.Errorf("insert build: %w", err)
|
||||
}
|
||||
|
||||
// Enqueue build ID to Redis for workers to pick up.
|
||||
if err := s.Redis.RPush(ctx, buildQueueKey, buildID).Err(); err != nil {
|
||||
return db.TemplateBuild{}, fmt.Errorf("enqueue build: %w", err)
|
||||
}
|
||||
|
||||
return build, nil
|
||||
}
|
||||
|
||||
// Get returns a single build by ID.
|
||||
func (s *BuildService) Get(ctx context.Context, buildID string) (db.TemplateBuild, error) {
|
||||
return s.DB.GetTemplateBuild(ctx, buildID)
|
||||
}
|
||||
|
||||
// List returns all builds ordered by creation time.
|
||||
func (s *BuildService) List(ctx context.Context) ([]db.TemplateBuild, error) {
|
||||
return s.DB.ListTemplateBuilds(ctx)
|
||||
}
|
||||
|
||||
// StartWorkers launches n goroutines that consume from the Redis build queue.
|
||||
// The returned cancel function stops all workers.
|
||||
func (s *BuildService) StartWorkers(ctx context.Context, n int) context.CancelFunc {
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
for i := range n {
|
||||
go s.worker(ctx, i)
|
||||
}
|
||||
slog.Info("build workers started", "count", n)
|
||||
return cancel
|
||||
}
|
||||
|
||||
func (s *BuildService) worker(ctx context.Context, workerID int) {
|
||||
log := slog.With("worker", workerID)
|
||||
for {
|
||||
// BLPOP blocks until a build ID is available or context is cancelled.
|
||||
result, err := s.Redis.BLPop(ctx, 0, buildQueueKey).Result()
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
log.Info("build worker shutting down")
|
||||
return
|
||||
}
|
||||
log.Error("redis BLPOP error", "error", err)
|
||||
time.Sleep(time.Second)
|
||||
continue
|
||||
}
|
||||
// result[0] is the key, result[1] is the build ID.
|
||||
buildID := result[1]
|
||||
log.Info("picked up build", "build_id", buildID)
|
||||
s.executeBuild(ctx, buildID)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BuildService) executeBuild(ctx context.Context, buildID string) {
|
||||
log := slog.With("build_id", buildID)
|
||||
|
||||
build, err := s.DB.GetTemplateBuild(ctx, buildID)
|
||||
if err != nil {
|
||||
log.Error("failed to fetch build", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Mark as running.
|
||||
if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
|
||||
ID: buildID, Status: "running",
|
||||
}); err != nil {
|
||||
log.Error("failed to update build status", "error", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse recipe.
|
||||
var recipe []string
|
||||
if err := json.Unmarshal(build.Recipe, &recipe); err != nil {
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("invalid recipe JSON: %v", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Pick a platform host and create a sandbox.
|
||||
host, err := s.Scheduler.SelectHost(ctx, platformTeamID, false)
|
||||
if err != nil {
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("no host available: %v", err))
|
||||
return
|
||||
}
|
||||
|
||||
agent, err := s.Pool.GetForHost(host)
|
||||
if err != nil {
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("agent client error: %v", err))
|
||||
return
|
||||
}
|
||||
|
||||
sandboxID := id.NewSandboxID()
|
||||
log = log.With("sandbox_id", sandboxID, "host_id", host.ID)
|
||||
|
||||
resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
|
||||
SandboxId: sandboxID,
|
||||
Template: build.BaseTemplate,
|
||||
Vcpus: build.Vcpus,
|
||||
MemoryMb: build.MemoryMb,
|
||||
TimeoutSec: 0, // no auto-pause for builds
|
||||
}))
|
||||
if err != nil {
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
||||
return
|
||||
}
|
||||
_ = resp
|
||||
|
||||
// Record sandbox/host association.
|
||||
_ = s.DB.UpdateBuildSandbox(ctx, db.UpdateBuildSandboxParams{
|
||||
ID: buildID,
|
||||
SandboxID: pgtype.Text{String: sandboxID, Valid: true},
|
||||
HostID: pgtype.Text{String: host.ID, Valid: true},
|
||||
})
|
||||
|
||||
// Execute recipe commands.
|
||||
var logs []BuildLogEntry
|
||||
for i, cmd := range recipe {
|
||||
log.Info("executing build step", "step", i+1, "cmd", cmd)
|
||||
|
||||
execCtx, cancel := context.WithTimeout(ctx, buildCommandTimeout)
|
||||
start := time.Now()
|
||||
|
||||
execResp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
|
||||
SandboxId: sandboxID,
|
||||
Cmd: "/bin/sh",
|
||||
Args: []string{"-c", cmd},
|
||||
TimeoutSec: int32(buildCommandTimeout.Seconds()),
|
||||
}))
|
||||
cancel()
|
||||
|
||||
entry := BuildLogEntry{
|
||||
Step: i + 1,
|
||||
Cmd: cmd,
|
||||
Elapsed: time.Since(start).Milliseconds(),
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
entry.Stderr = err.Error()
|
||||
entry.Ok = false
|
||||
logs = append(logs, entry)
|
||||
s.updateLogs(ctx, buildID, i+1, logs)
|
||||
s.destroySandbox(ctx, agent, sandboxID)
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("step %d exec error: %v", i+1, err))
|
||||
return
|
||||
}
|
||||
|
||||
entry.Stdout = string(execResp.Msg.Stdout)
|
||||
entry.Stderr = string(execResp.Msg.Stderr)
|
||||
entry.Exit = execResp.Msg.ExitCode
|
||||
entry.Ok = execResp.Msg.ExitCode == 0
|
||||
logs = append(logs, entry)
|
||||
|
||||
s.updateLogs(ctx, buildID, i+1, logs)
|
||||
|
||||
if execResp.Msg.ExitCode != 0 {
|
||||
s.destroySandbox(ctx, agent, sandboxID)
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("step %d failed with exit code %d", i+1, execResp.Msg.ExitCode))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Healthcheck or direct snapshot.
|
||||
if build.Healthcheck.Valid && build.Healthcheck.String != "" {
|
||||
log.Info("running healthcheck", "cmd", build.Healthcheck.String)
|
||||
if err := s.waitForHealthcheck(ctx, agent, sandboxID, build.Healthcheck.String); err != nil {
|
||||
s.destroySandbox(ctx, agent, sandboxID)
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
|
||||
return
|
||||
}
|
||||
|
||||
// Healthcheck passed → full snapshot (with memory/CPU state).
|
||||
log.Info("healthcheck passed, creating snapshot")
|
||||
if _, err := agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
||||
SandboxId: sandboxID,
|
||||
Name: build.Name,
|
||||
})); err != nil {
|
||||
s.destroySandbox(ctx, agent, sandboxID)
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
|
||||
return
|
||||
}
|
||||
} else {
|
||||
// No healthcheck → image-only template (rootfs only).
|
||||
log.Info("no healthcheck, flattening rootfs")
|
||||
if _, err := agent.FlattenRootfs(ctx, connect.NewRequest(&pb.FlattenRootfsRequest{
|
||||
SandboxId: sandboxID,
|
||||
Name: build.Name,
|
||||
})); err != nil {
|
||||
s.destroySandbox(ctx, agent, sandboxID)
|
||||
s.failBuild(ctx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Insert into templates table as a global (platform) template.
|
||||
templateType := "base"
|
||||
if build.Healthcheck.Valid && build.Healthcheck.String != "" {
|
||||
templateType = "snapshot"
|
||||
}
|
||||
|
||||
if _, err := s.DB.InsertTemplate(ctx, db.InsertTemplateParams{
|
||||
Name: build.Name,
|
||||
Type: templateType,
|
||||
Vcpus: pgtype.Int4{Int32: build.Vcpus, Valid: true},
|
||||
MemoryMb: pgtype.Int4{Int32: build.MemoryMb, Valid: true},
|
||||
SizeBytes: 0, // Could query the host, but the template is created.
|
||||
TeamID: platformTeamID,
|
||||
}); err != nil {
|
||||
log.Error("failed to insert template record", "error", err)
|
||||
// Build succeeded on disk, just DB record failed — don't mark as failed.
|
||||
}
|
||||
|
||||
// For CreateSnapshot, the sandbox is already destroyed by the snapshot process.
|
||||
// For FlattenRootfs, the sandbox is already destroyed by the flatten process.
|
||||
// No additional destroy needed.
|
||||
|
||||
// Mark build as success.
|
||||
if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
|
||||
ID: buildID, Status: "success",
|
||||
}); err != nil {
|
||||
log.Error("failed to mark build as success", "error", err)
|
||||
}
|
||||
|
||||
log.Info("template build completed successfully", "name", build.Name)
|
||||
}
|
||||
|
||||
func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxID, cmd string) error {
|
||||
deadline := time.After(healthcheckTimeout)
|
||||
ticker := time.NewTicker(healthcheckInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-deadline:
|
||||
return fmt.Errorf("healthcheck timed out after %s", healthcheckTimeout)
|
||||
case <-ticker.C:
|
||||
execCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
resp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
|
||||
SandboxId: sandboxID,
|
||||
Cmd: "/bin/sh",
|
||||
Args: []string{"-c", cmd},
|
||||
TimeoutSec: 10,
|
||||
}))
|
||||
cancel()
|
||||
|
||||
if err != nil {
|
||||
slog.Debug("healthcheck exec error (retrying)", "error", err)
|
||||
continue
|
||||
}
|
||||
if resp.Msg.ExitCode == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Debug("healthcheck failed (retrying)", "exit_code", resp.Msg.ExitCode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BuildService) updateLogs(ctx context.Context, buildID string, step int, logs []BuildLogEntry) {
|
||||
logsJSON, err := json.Marshal(logs)
|
||||
if err != nil {
|
||||
slog.Warn("failed to marshal build logs", "error", err)
|
||||
return
|
||||
}
|
||||
if err := s.DB.UpdateBuildProgress(ctx, db.UpdateBuildProgressParams{
|
||||
ID: buildID,
|
||||
CurrentStep: int32(step),
|
||||
Logs: logsJSON,
|
||||
}); err != nil {
|
||||
slog.Warn("failed to update build progress", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BuildService) failBuild(ctx context.Context, buildID, errMsg string) {
|
||||
slog.Error("build failed", "build_id", buildID, "error", errMsg)
|
||||
if err := s.DB.UpdateBuildError(ctx, db.UpdateBuildErrorParams{
|
||||
ID: buildID,
|
||||
Error: pgtype.Text{String: errMsg, Valid: true},
|
||||
}); err != nil {
|
||||
slog.Error("failed to update build error", "build_id", buildID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *BuildService) destroySandbox(ctx context.Context, agent buildAgentClient, sandboxID string) {
|
||||
if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
|
||||
SandboxId: sandboxID,
|
||||
})); err != nil {
|
||||
slog.Warn("failed to destroy build sandbox", "sandbox_id", sandboxID, "error", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user