Add skip_pre_post build option, cancel endpoint, and recipe package

- skip_pre_post flag on builds bypasses the apt update/clean pre/post steps, for faster iteration when the recipe handles its own environment setup
- POST /v1/admin/builds/{id}/cancel endpoint marks a pending or in-progress build as cancelled; UpdateBuildStatus now also sets completed_at for 'cancelled'
- internal/recipe: typed recipe parser and executor (RUN/START/ENV/WORKDIR steps; USER and COPY are parsed but not yet supported), replacing the raw string slice approach in the build worker
- pre/post build commands are prefixed with RUN to match the recipe step format
2026-03-30 21:24:52 +06:00
parent 25ce0729d5
commit 948db13bed
12 changed files with 981 additions and 134 deletions
--- a/internal/api/handlers_builds.go
+++ b/internal/api/handlers_builds.go
@ -36,6 +36,7 @@ type createBuildRequest struct {
 	Healthcheck  string   `json:"healthcheck"`
 	VCPUs        int32    `json:"vcpus"`
 	MemoryMB     int32    `json:"memory_mb"`
+	SkipPrePost  bool     `json:"skip_pre_post"`
 }

 type buildResponse struct {
@ -127,6 +128,7 @@ func (h *buildHandler) Create(w http.ResponseWriter, r *http.Request) {
 		Healthcheck:  req.Healthcheck,
 		VCPUs:        req.VCPUs,
 		MemoryMB:     req.MemoryMB,
+		SkipPrePost:  req.SkipPrePost,
 	})
 	if err != nil {
 		slog.Error("failed to create build", "error", err)
@ -254,3 +256,21 @@ func (h *buildHandler) DeleteTemplate(w http.ResponseWriter, r *http.Request) {

 	w.WriteHeader(http.StatusNoContent)
 }
+
+// Cancel handles POST /v1/admin/builds/{id}/cancel.
+//
+// The {id} URL parameter is parsed as a build ID; a malformed ID yields
+// 400 invalid_request. Any error returned by h.svc.Cancel is likewise
+// reported as 400 invalid_request, with the error text as the message. On
+// success only a 204 No Content status is written.
+//
+// NOTE(review): service errors that are not the caller's fault (e.g. a
+// failed DB lookup) are also surfaced as 400 here — confirm this is intended.
+func (h *buildHandler) Cancel(w http.ResponseWriter, r *http.Request) {
+	buildID, parseErr := id.ParseBuildID(chi.URLParam(r, "id"))
+	if parseErr != nil {
+		writeError(w, http.StatusBadRequest, "invalid_request", "invalid build ID")
+		return
+	}
+
+	cancelErr := h.svc.Cancel(r.Context(), buildID)
+	if cancelErr != nil {
+		writeError(w, http.StatusBadRequest, "invalid_request", cancelErr.Error())
+		return
+	}
+
+	w.WriteHeader(http.StatusNoContent)
+}
--- a/internal/db/template_builds.sql.go
+++ b/internal/db/template_builds.sql.go
@ -12,7 +12,7 @@ import (
 )

 const getTemplateBuild = `-- name: GetTemplateBuild :one
-SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id FROM template_builds WHERE id = $1
+SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id, skip_pre_post FROM template_builds WHERE id = $1
 `

 func (q *Queries) GetTemplateBuild(ctx context.Context, id pgtype.UUID) (TemplateBuild, error) {
@ -38,14 +38,15 @@ func (q *Queries) GetTemplateBuild(ctx context.Context, id pgtype.UUID) (Templat
 		&i.CompletedAt,
 		&i.TemplateID,
 		&i.TeamID,
+		&i.SkipPrePost,
 	)
 	return i, err
 }

 const insertTemplateBuild = `-- name: InsertTemplateBuild :one
-INSERT INTO template_builds (id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, total_steps, template_id, team_id)
-VALUES ($1, $2, $3, $4, $5, $6, $7, 'pending', $8, $9, $10)
-RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id
+INSERT INTO template_builds (id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, total_steps, template_id, team_id, skip_pre_post)
+VALUES ($1, $2, $3, $4, $5, $6, $7, 'pending', $8, $9, $10, $11)
+RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id, skip_pre_post
 `

 type InsertTemplateBuildParams struct {
@ -59,6 +60,7 @@ type InsertTemplateBuildParams struct {
 	TotalSteps   int32       `json:"total_steps"`
 	TemplateID   pgtype.UUID `json:"template_id"`
 	TeamID       pgtype.UUID `json:"team_id"`
+	SkipPrePost  bool        `json:"skip_pre_post"`
 }

 func (q *Queries) InsertTemplateBuild(ctx context.Context, arg InsertTemplateBuildParams) (TemplateBuild, error) {
@ -73,6 +75,7 @@ func (q *Queries) InsertTemplateBuild(ctx context.Context, arg InsertTemplateBui
 		arg.TotalSteps,
 		arg.TemplateID,
 		arg.TeamID,
+		arg.SkipPrePost,
 	)
 	var i TemplateBuild
 	err := row.Scan(
@ -95,12 +98,13 @@ func (q *Queries) InsertTemplateBuild(ctx context.Context, arg InsertTemplateBui
 		&i.CompletedAt,
 		&i.TemplateID,
 		&i.TeamID,
+		&i.SkipPrePost,
 	)
 	return i, err
 }

 const listTemplateBuilds = `-- name: ListTemplateBuilds :many
-SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id FROM template_builds ORDER BY created_at DESC
+SELECT id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id, skip_pre_post FROM template_builds ORDER BY created_at DESC
 `

 func (q *Queries) ListTemplateBuilds(ctx context.Context) ([]TemplateBuild, error) {
@ -132,6 +136,7 @@ func (q *Queries) ListTemplateBuilds(ctx context.Context) ([]TemplateBuild, erro
 			&i.CompletedAt,
 			&i.TemplateID,
 			&i.TeamID,
+			&i.SkipPrePost,
 		); err != nil {
 			return nil, err
 		}
@ -196,10 +201,10 @@ func (q *Queries) UpdateBuildSandbox(ctx context.Context, arg UpdateBuildSandbox
 const updateBuildStatus = `-- name: UpdateBuildStatus :one
 UPDATE template_builds
 SET status = $2,
-    started_at = CASE WHEN $2 = 'running' AND started_at IS NULL THEN NOW() ELSE started_at END,
-    completed_at = CASE WHEN $2 IN ('success', 'failed') THEN NOW() ELSE completed_at END
+    started_at   = CASE WHEN $2 = 'running'   AND started_at   IS NULL THEN NOW() ELSE started_at   END,
+    completed_at = CASE WHEN $2 IN ('success', 'failed', 'cancelled') THEN NOW() ELSE completed_at END
 WHERE id = $1
-RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id
+RETURNING id, name, base_template, recipe, healthcheck, vcpus, memory_mb, status, current_step, total_steps, logs, error, sandbox_id, host_id, created_at, started_at, completed_at, template_id, team_id, skip_pre_post
 `

 type UpdateBuildStatusParams struct {
@ -230,6 +235,7 @@ func (q *Queries) UpdateBuildStatus(ctx context.Context, arg UpdateBuildStatusPa
 		&i.CompletedAt,
 		&i.TemplateID,
 		&i.TeamID,
+		&i.SkipPrePost,
 	)
 	return i, err
 }
--- a/internal/recipe/context.go
+++ b/internal/recipe/context.go
@ -0,0 +1,63 @@
+package recipe
+
+import (
+	"sort"
+	"strings"
+)
+
+// ExecContext holds mutable state that persists across recipe steps.
+// The zero value is an empty context; the executor updates it as ENV and
+// WORKDIR steps run.
+type ExecContext struct {
+	WorkDir string            // directory each command is run from; "" means no cd is emitted
+	EnvVars map[string]string // variables placed in front of each command; may be nil
+}
+
+// WrappedCommand returns the full shell command for a RUN step with context
+// applied. The result is passed as the argument to /bin/sh -c.
+//
+// With an empty context cmd is returned unchanged. Otherwise WORKDIR and/or
+// ENV are prepended as a shell preamble and cmd is run in a nested shell:
+//
+//	cd '/the/dir' && KEY='val' /bin/sh -c 'original command'
+//
+// Environment variables are emitted in sorted key order, so the same context
+// always produces the same string.
+func (c *ExecContext) WrappedCommand(cmd string) string {
+	prefix := c.shellPrefix()
+	if prefix == "" {
+		return cmd
+	}
+	return prefix + "/bin/sh -c " + shellescape(cmd)
+}
+
+// StartCommand returns the shell command for a START step. The process is
+// launched in the background via nohup so that the outer shell exits
+// immediately, allowing the build to continue. stdout/stderr of the
+// background process are discarded (the process keeps running in the VM).
+//
+// Unlike WrappedCommand, cmd is always wrapped in "nohup /bin/sh -c ...",
+// even when the context is empty.
+//
+// Multiple START steps can be issued to run several background processes
+// simultaneously before a healthcheck is evaluated.
+func (c *ExecContext) StartCommand(cmd string) string {
+	prefix := c.shellPrefix()
+	return prefix + "nohup /bin/sh -c " + shellescape(cmd) + " >/dev/null 2>&1 &"
+}
+
+// shellPrefix builds the "cd ... && KEY=val " preamble for a shell command.
+// Returns an empty string when no context is set (including a nil receiver).
+//
+// Keys are written in sorted order: Go randomises map iteration, and without
+// sorting a context with several variables would yield a different command
+// string on each call. Keys are emitted verbatim; only values are quoted.
+func (c *ExecContext) shellPrefix() string {
+	if c == nil || (c.WorkDir == "" && len(c.EnvVars) == 0) {
+		return ""
+	}
+	var sb strings.Builder
+	if c.WorkDir != "" {
+		sb.WriteString("cd ")
+		sb.WriteString(shellescape(c.WorkDir))
+		sb.WriteString(" && ")
+	}
+
+	keys := make([]string, 0, len(c.EnvVars))
+	for k := range c.EnvVars {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+
+	for _, k := range keys {
+		sb.WriteString(k)
+		sb.WriteByte('=')
+		sb.WriteString(shellescape(c.EnvVars[k]))
+		sb.WriteByte(' ')
+	}
+	return sb.String()
+}
+
+// shellescape wraps s in single quotes, escaping any embedded single quotes.
+// Each embedded ' becomes '\'' (close the quote, emit an escaped quote,
+// reopen), which is POSIX-safe for paths, env values, and shell commands.
+func shellescape(s string) string {
+	return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
+}
--- a/internal/recipe/context_test.go
+++ b/internal/recipe/context_test.go
@ -0,0 +1,114 @@
+package recipe
+
+import "testing"
+
+// TestExecContext_WrappedCommand checks the exact shell string produced by
+// WrappedCommand for an empty context, a working directory, a single
+// environment variable, and inputs that contain spaces or single quotes.
+// Every case uses at most one environment variable.
+func TestExecContext_WrappedCommand(t *testing.T) {
+	tests := []struct {
+		name string
+		ctx  ExecContext
+		cmd  string
+		want string
+	}{
+		{
+			name: "no context",
+			ctx:  ExecContext{},
+			cmd:  "apt install -y curl",
+			want: "apt install -y curl",
+		},
+		{
+			name: "workdir only",
+			ctx:  ExecContext{WorkDir: "/app"},
+			cmd:  "npm install",
+			want: "cd '/app' && /bin/sh -c 'npm install'",
+		},
+		{
+			name: "env only",
+			ctx:  ExecContext{EnvVars: map[string]string{"PORT": "8080"}},
+			cmd:  "node server.js",
+			want: "PORT='8080' /bin/sh -c 'node server.js'",
+		},
+		{
+			name: "workdir with space",
+			ctx:  ExecContext{WorkDir: "/my project"},
+			cmd:  "make build",
+			want: "cd '/my project' && /bin/sh -c 'make build'",
+		},
+		{
+			name: "command with single quotes",
+			ctx:  ExecContext{WorkDir: "/app"},
+			cmd:  "echo 'hello'",
+			want: "cd '/app' && /bin/sh -c 'echo '\\''hello'\\'''",
+		},
+		{
+			name: "env value with single quotes",
+			ctx:  ExecContext{EnvVars: map[string]string{"MSG": "it's fine"}},
+			cmd:  "echo $MSG",
+			want: "MSG='it'\\''s fine' /bin/sh -c 'echo $MSG'",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.ctx.WrappedCommand(tc.cmd)
+			if got != tc.want {
+				t.Errorf("WrappedCommand(%q)\n  got  %q\n  want %q", tc.cmd, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestExecContext_StartCommand checks the exact background-launch string
+// produced by StartCommand with no context, with a working directory, and
+// with a single environment variable.
+func TestExecContext_StartCommand(t *testing.T) {
+	tests := []struct {
+		name string
+		ctx  ExecContext
+		cmd  string
+		want string
+	}{
+		{
+			name: "no context",
+			ctx:  ExecContext{},
+			cmd:  "python3 app.py",
+			want: "nohup /bin/sh -c 'python3 app.py' >/dev/null 2>&1 &",
+		},
+		{
+			name: "with workdir",
+			ctx:  ExecContext{WorkDir: "/app"},
+			cmd:  "python3 server.py",
+			want: "cd '/app' && nohup /bin/sh -c 'python3 server.py' >/dev/null 2>&1 &",
+		},
+		{
+			name: "with env",
+			ctx:  ExecContext{EnvVars: map[string]string{"PORT": "9000"}},
+			cmd:  "node index.js",
+			want: "PORT='9000' nohup /bin/sh -c 'node index.js' >/dev/null 2>&1 &",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.ctx.StartCommand(tc.cmd)
+			if got != tc.want {
+				t.Errorf("StartCommand(%q)\n  got  %q\n  want %q", tc.cmd, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestShellescape verifies single-quote wrapping for plain strings, the empty
+// string, and inputs with one or more embedded single quotes.
+func TestShellescape(t *testing.T) {
+	cases := []struct {
+		input string
+		want  string
+	}{
+		{input: "simple", want: "'simple'"},
+		{input: "/path/to/dir", want: "'/path/to/dir'"},
+		{input: "it's fine", want: "'it'\\''s fine'"},
+		{input: "", want: "''"},
+		{input: "a'b'c", want: "'a'\\''b'\\''c'"},
+	}
+	for _, c := range cases {
+		if got := shellescape(c.input); got != c.want {
+			t.Errorf("shellescape(%q) = %q, want %q", c.input, got, c.want)
+		}
+	}
+}
--- a/internal/recipe/executor.go
+++ b/internal/recipe/executor.go
@ -0,0 +1,185 @@
+package recipe
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"strings"
+	"time"
+
+	"connectrpc.com/connect"
+
+	pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
+)
+
+// DefaultStepTimeout is the fallback timeout for RUN steps that carry no
+// explicit --timeout flag.
+//
+// Execute does not reference this constant itself: it substitutes 10 minutes
+// when its defaultTimeout argument is <= 0.
+// NOTE(review): presumably callers pass this value as Execute's
+// defaultTimeout — confirm against the build worker.
+const DefaultStepTimeout = 30 * time.Second
+
+// BuildLogEntry is the per-step record stored in template_builds.logs (JSONB).
+// One entry is produced for every step the executor attempts, including ENV
+// and WORKDIR steps, which make no exec call.
+type BuildLogEntry struct {
+	Step    int    `json:"step"`       // step number, counted across phases
+	Phase   string `json:"phase"`      // phase label, e.g. "pre-build", "recipe", "post-build"
+	Cmd     string `json:"cmd"`        // raw recipe instruction as written
+	Stdout  string `json:"stdout"`     // captured standard output (set for RUN steps)
+	Stderr  string `json:"stderr"`     // captured standard error, or an executor-generated error message
+	Exit    int32  `json:"exit"`       // process exit code; left 0 when the exec call itself failed
+	Ok      bool   `json:"ok"`         // true when the step succeeded
+	Elapsed int64  `json:"elapsed_ms"` // duration of the exec call in milliseconds; 0 when none was made
+}
+
+// ExecFunc is the agent.Exec call signature used by the executor. It matches
+// the method on the hostagent Connect RPC client. The executor calls it once
+// per RUN or START step; a non-nil error marks that step as failed.
+type ExecFunc func(ctx context.Context, req *connect.Request[pb.ExecRequest]) (*connect.Response[pb.ExecResponse], error)
+
+// Execute runs steps sequentially against sandboxID using execFn.
+//
+//   - phase labels the log entries (e.g., "pre-build", "recipe", "post-build").
+//   - startStep is the number of steps already executed in earlier phases
+//     (0 for the first phase); the first step run here is numbered
+//     startStep+1, so entries are globally numbered across phases.
+//   - defaultTimeout applies to RUN steps with no per-step --timeout; a value
+//     <= 0 is replaced by 10 minutes.
+//   - bctx is mutated in place as ENV/WORKDIR steps execute, and carries forward
+//     into subsequent phases when the caller passes the same pointer. A nil
+//     bctx is treated as a fresh, empty context local to this call.
+//
+// Returns all log entries appended during this call, the number of the last
+// step attempted (pass it as startStep for the next phase), and whether all
+// steps succeeded. Execution stops at the first failing step. On false the
+// last entry contains failure details; the caller is responsible for
+// destroying the sandbox and recording the build error.
+func Execute(
+	ctx context.Context,
+	phase string,
+	steps []Step,
+	sandboxID string,
+	startStep int,
+	defaultTimeout time.Duration,
+	bctx *ExecContext,
+	execFn ExecFunc,
+) (entries []BuildLogEntry, nextStep int, ok bool) {
+	if defaultTimeout <= 0 {
+		defaultTimeout = 10 * time.Minute
+	}
+	if bctx == nil {
+		// Avoid a nil dereference on the first ENV/WORKDIR/RUN/START step.
+		bctx = &ExecContext{}
+	}
+
+	step := startStep
+	for _, st := range steps {
+		step++
+		slog.Info("executing build step", "phase", phase, "step", step, "instruction", st.Raw)
+
+		switch st.Kind {
+		case KindENV:
+			if bctx.EnvVars == nil {
+				bctx.EnvVars = make(map[string]string)
+			}
+			bctx.EnvVars[st.Key] = st.Value
+			entries = append(entries, BuildLogEntry{Step: step, Phase: phase, Cmd: st.Raw, Ok: true})
+
+		case KindWORKDIR:
+			bctx.WorkDir = st.Path
+			entries = append(entries, BuildLogEntry{Step: step, Phase: phase, Cmd: st.Raw, Ok: true})
+
+		case KindUSER, KindCOPY:
+			// Raw is non-empty for parsed steps, but a hand-built Step may
+			// leave it blank; fall back to a generic word instead of
+			// indexing an empty slice.
+			verb := "instruction"
+			if fields := strings.Fields(st.Raw); len(fields) > 0 {
+				verb = strings.ToUpper(fields[0])
+			}
+			entries = append(entries, BuildLogEntry{
+				Step:   step,
+				Phase:  phase,
+				Cmd:    st.Raw,
+				Stderr: verb + " is not yet supported",
+				Ok:     false,
+			})
+			return entries, step, false
+
+		case KindSTART:
+			entry, succeeded := execStart(ctx, st, sandboxID, phase, step, bctx, execFn)
+			entries = append(entries, entry)
+			if !succeeded {
+				return entries, step, false
+			}
+
+		case KindRUN:
+			timeout := defaultTimeout
+			if st.Timeout > 0 {
+				timeout = st.Timeout
+			}
+			entry, succeeded := execRun(ctx, st, sandboxID, phase, step, timeout, bctx, execFn)
+			entries = append(entries, entry)
+			if !succeeded {
+				return entries, step, false
+			}
+
+		default:
+			// An unrecognised Kind would otherwise be skipped without a log
+			// entry and the phase reported as successful.
+			entries = append(entries, BuildLogEntry{
+				Step:   step,
+				Phase:  phase,
+				Cmd:    st.Raw,
+				Stderr: fmt.Sprintf("unknown step kind %d", int(st.Kind)),
+				Ok:     false,
+			})
+			return entries, step, false
+		}
+	}
+	return entries, step, true
+}
+
+func execRun(
+	ctx context.Context,
+	st Step,
+	sandboxID, phase string,
+	step int,
+	timeout time.Duration,
+	bctx *ExecContext,
+	execFn ExecFunc,
+) (BuildLogEntry, bool) {
+	execCtx, cancel := context.WithTimeout(ctx, timeout)
+	defer cancel()
+
+	start := time.Now()
+	resp, err := execFn(execCtx, connect.NewRequest(&pb.ExecRequest{
+		SandboxId:  sandboxID,
+		Cmd:        "/bin/sh",
+		Args:       []string{"-c", bctx.WrappedCommand(st.Shell)},
+		TimeoutSec: int32(timeout.Seconds()),
+	}))
+
+	entry := BuildLogEntry{
+		Step:    step,
+		Phase:   phase,
+		Cmd:     st.Raw,
+		Elapsed: time.Since(start).Milliseconds(),
+	}
+	if err != nil {
+		entry.Stderr = fmt.Sprintf("exec error: %v", err)
+		return entry, false
+	}
+	entry.Stdout = string(resp.Msg.Stdout)
+	entry.Stderr = string(resp.Msg.Stderr)
+	entry.Exit = resp.Msg.ExitCode
+	entry.Ok = resp.Msg.ExitCode == 0
+	return entry, entry.Ok
+}
+
+func execStart(
+	ctx context.Context,
+	st Step,
+	sandboxID, phase string,
+	step int,
+	bctx *ExecContext,
+	execFn ExecFunc,
+) (BuildLogEntry, bool) {
+	// START uses a short timeout: just long enough for the shell to fork and
+	// return. The background process itself runs indefinitely inside the VM.
+	execCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+	defer cancel()
+
+	start := time.Now()
+	resp, err := execFn(execCtx, connect.NewRequest(&pb.ExecRequest{
+		SandboxId:  sandboxID,
+		Cmd:        "/bin/sh",
+		Args:       []string{"-c", bctx.StartCommand(st.Shell)},
+		TimeoutSec: 10,
+	}))
+
+	entry := BuildLogEntry{
+		Step:    step,
+		Phase:   phase,
+		Cmd:     st.Raw,
+		Elapsed: time.Since(start).Milliseconds(),
+	}
+	if err != nil {
+		entry.Stderr = fmt.Sprintf("start error: %v", err)
+		return entry, false
+	}
+	entry.Exit = resp.Msg.ExitCode
+	entry.Ok = resp.Msg.ExitCode == 0
+	if !entry.Ok {
+		entry.Stderr = fmt.Sprintf("start failed with exit code %d: %s", resp.Msg.ExitCode, string(resp.Msg.Stderr))
+	}
+	return entry, entry.Ok
+}
--- a/internal/recipe/step.go
+++ b/internal/recipe/step.go
@ -0,0 +1,129 @@
+package recipe
+
+import (
+	"fmt"
+	"strings"
+	"time"
+)
+
+// Kind identifies the instruction type in a recipe line.
+type Kind int
+
+// Instruction kinds. Values are assigned by iota, so KindRUN is the zero
+// value of Kind.
+const (
+	KindRUN     Kind = iota // Execute a command and wait for it to exit.
+	KindSTART               // Start a command in the background (non-blocking).
+	KindENV                 // Set an environment variable for subsequent steps.
+	KindWORKDIR             // Set the working directory for subsequent steps.
+	KindUSER                // Switch the unix user for subsequent steps. (stub)
+	KindCOPY                // Copy files into the sandbox. (stub)
+)
+
+// Step is the parsed representation of one recipe instruction.
+//
+// Only the fields relevant to Kind are populated; the rest keep their zero
+// values. All fields are plain values, so two Steps can be compared with ==
+// (the parser tests rely on this).
+type Step struct {
+	Kind    Kind
+	Raw     string        // original string, preserved for logging
+	Shell   string        // KindRUN, KindSTART: the shell command text
+	Timeout time.Duration // KindRUN: 0 means use caller's default
+	Key     string        // KindENV: variable name
+	Value   string        // KindENV: variable value
+	Path    string        // KindWORKDIR: directory path
+}
+
+// ParseStep parses a single recipe instruction string into a Step.
+// Instructions are Dockerfile-like: a keyword followed by arguments.
+//
+// Leading and trailing whitespace is ignored. The keyword is matched
+// case-insensitively and ends at the first space or tab; the remainder,
+// trimmed, is handed to the per-instruction parser. Step.Raw is the trimmed
+// input. An empty input or an unknown keyword is an error.
+//
+// Supported syntax:
+//
+//	RUN <cmd>                 — run command, wait for exit
+//	RUN --timeout=<d> <cmd>  — run command with explicit timeout (e.g. --timeout=5m)
+//	START <cmd>               — start command in background, return immediately
+//	ENV <key>=<value>         — set environment variable
+//	WORKDIR <path>            — set working directory
+//	USER <name>               — not yet supported
+//	COPY <src> <dst>          — not yet supported
+//
+// USER and COPY are accepted here (with any or no arguments) and rejected
+// later by the executor.
+func ParseStep(s string) (Step, error) {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return Step{}, fmt.Errorf("empty step")
+	}
+
+	// Split at the first space or tab to get the keyword. Accepting a tab
+	// keeps "RUN<TAB>cmd" from being reported as an unknown instruction.
+	keyword, rest := s, ""
+	if i := strings.IndexAny(s, " \t"); i >= 0 {
+		keyword, rest = s[:i], strings.TrimSpace(s[i+1:])
+	}
+
+	switch strings.ToUpper(keyword) {
+	case "RUN":
+		return parseRUN(s, rest)
+	case "START":
+		return parseSTART(s, rest)
+	case "ENV":
+		return parseENV(s, rest)
+	case "WORKDIR":
+		return parseWORKDIR(s, rest)
+	case "USER":
+		return Step{Kind: KindUSER, Raw: s}, nil
+	case "COPY":
+		return Step{Kind: KindCOPY, Raw: s}, nil
+	default:
+		return Step{}, fmt.Errorf("unknown instruction %q (expected RUN, START, ENV, WORKDIR, USER, or COPY)", keyword)
+	}
+}
+
+// ParseRecipe converts every recipe line into a Step, in order. It stops at
+// the first line that fails to parse and returns a nil slice together with an
+// error that names the 1-based line number and wraps the parse error.
+func ParseRecipe(lines []string) ([]Step, error) {
+	parsed := make([]Step, len(lines))
+	for idx := range lines {
+		st, err := ParseStep(lines[idx])
+		if err != nil {
+			return nil, fmt.Errorf("recipe line %d: %w", idx+1, err)
+		}
+		parsed[idx] = st
+	}
+	return parsed, nil
+}
+
+// parseRUN builds a KindRUN step from rest, the text following the RUN
+// keyword; raw is the whole instruction and is kept for logging.
+//
+// An optional leading "--timeout=<duration>" flag (time.ParseDuration syntax,
+// e.g. 30s or 5m) sets Step.Timeout. The flag ends at the first space or tab
+// and must be followed by a command. The duration must be positive: the
+// executor treats a non-positive Timeout as "use the default", so accepting
+// "--timeout=0s" or a negative value would silently discard what was written.
+// Without the flag, Timeout is 0 and the caller's default applies.
+func parseRUN(raw, rest string) (Step, error) {
+	var timeout time.Duration
+	if strings.HasPrefix(rest, "--timeout=") {
+		arg := strings.TrimPrefix(rest, "--timeout=")
+		flag, cmd := arg, ""
+		if i := strings.IndexAny(arg, " \t"); i >= 0 {
+			flag, cmd = arg[:i], strings.TrimSpace(arg[i+1:])
+		}
+		if cmd == "" {
+			return Step{}, fmt.Errorf("RUN --timeout= flag has no command: %q", raw)
+		}
+		d, err := time.ParseDuration(flag)
+		if err != nil {
+			return Step{}, fmt.Errorf("RUN --timeout= invalid duration %q: %w", flag, err)
+		}
+		if d <= 0 {
+			return Step{}, fmt.Errorf("RUN --timeout= duration must be positive, got %q", flag)
+		}
+		timeout = d
+		rest = cmd
+	}
+	if rest == "" {
+		return Step{}, fmt.Errorf("RUN requires a command: %q", raw)
+	}
+	return Step{Kind: KindRUN, Raw: raw, Shell: rest, Timeout: timeout}, nil
+}
+
+// parseSTART builds a KindSTART step whose Shell is rest, the text following
+// the START keyword. An empty rest is an error. raw is kept for logging.
+func parseSTART(raw, rest string) (Step, error) {
+	if len(rest) == 0 {
+		return Step{}, fmt.Errorf("START requires a command: %q", raw)
+	}
+	st := Step{
+		Kind:  KindSTART,
+		Raw:   raw,
+		Shell: rest,
+	}
+	return st, nil
+}
+
+// parseENV builds a KindENV step from rest, the text following the ENV
+// keyword; raw is the whole instruction and is kept for logging.
+//
+// rest is split at the first '='. Everything after it is the value, taken
+// verbatim: it may be empty, may contain spaces or further '=' characters,
+// and surrounding quotes are not stripped.
+//
+// The key must be a shell variable name ([A-Za-z_][A-Za-z0-9_]*). The key is
+// later written unquoted in front of the command as KEY='value', so any other
+// key (for example one containing a space) would not be read by the shell as
+// an assignment.
+func parseENV(raw, rest string) (Step, error) {
+	key, value, found := strings.Cut(rest, "=")
+	if !found {
+		return Step{}, fmt.Errorf("ENV requires KEY=VALUE format: %q", raw)
+	}
+	if key == "" {
+		return Step{}, fmt.Errorf("ENV key is empty: %q", raw)
+	}
+	if !isEnvName(key) {
+		return Step{}, fmt.Errorf("ENV key %q is not a valid variable name: %q", key, raw)
+	}
+	return Step{Kind: KindENV, Raw: raw, Key: key, Value: value}, nil
+}
+
+// isEnvName reports whether s is a non-empty name made of ASCII letters,
+// digits and underscores that does not start with a digit.
+func isEnvName(s string) bool {
+	if s == "" {
+		return false
+	}
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		switch {
+		case c == '_', 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
+		case '0' <= c && c <= '9' && i > 0:
+		default:
+			return false
+		}
+	}
+	return true
+}
+
+// parseWORKDIR builds a KindWORKDIR step whose Path is the text following the
+// WORKDIR keyword (spaces inside the path are preserved). An empty path is an
+// error. raw is kept for logging.
+func parseWORKDIR(raw, path string) (Step, error) {
+	if len(path) == 0 {
+		return Step{}, fmt.Errorf("WORKDIR requires a path: %q", raw)
+	}
+	st := Step{
+		Kind: KindWORKDIR,
+		Raw:  raw,
+		Path: path,
+	}
+	return st, nil
+}
--- a/internal/recipe/step_test.go
+++ b/internal/recipe/step_test.go
@ -0,0 +1,208 @@
+package recipe
+
+import (
+	"testing"
+	"time"
+)
+
+// TestParseStep is a table-driven test of ParseStep covering every keyword
+// (RUN, START, ENV, WORKDIR, USER, COPY), an unknown keyword, and empty
+// input. Successful cases compare the whole Step with !=; error cases only
+// check that an error is returned, not its text.
+func TestParseStep(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   string
+		want    Step
+		wantErr bool
+	}{
+		// RUN
+		{
+			name:  "RUN basic",
+			input: "RUN apt install -y curl",
+			want:  Step{Kind: KindRUN, Raw: "RUN apt install -y curl", Shell: "apt install -y curl"},
+		},
+		{
+			name:  "RUN lowercase",
+			input: "run echo hello",
+			want:  Step{Kind: KindRUN, Raw: "run echo hello", Shell: "echo hello"},
+		},
+		{
+			name:  "RUN with timeout",
+			input: "RUN --timeout=5m npm install",
+			want:  Step{Kind: KindRUN, Raw: "RUN --timeout=5m npm install", Shell: "npm install", Timeout: 5 * time.Minute},
+		},
+		{
+			name:  "RUN with timeout seconds",
+			input: "RUN --timeout=30s make build",
+			want:  Step{Kind: KindRUN, Raw: "RUN --timeout=30s make build", Shell: "make build", Timeout: 30 * time.Second},
+		},
+		{
+			name:    "RUN no command",
+			input:   "RUN",
+			wantErr: true,
+		},
+		{
+			name:    "RUN timeout no command",
+			input:   "RUN --timeout=5m",
+			wantErr: true,
+		},
+		{
+			name:    "RUN invalid timeout",
+			input:   "RUN --timeout=notaduration echo hi",
+			wantErr: true,
+		},
+		// START
+		{
+			name:  "START basic",
+			input: "START python3 app.py",
+			want:  Step{Kind: KindSTART, Raw: "START python3 app.py", Shell: "python3 app.py"},
+		},
+		{
+			name:  "START uppercase",
+			input: "START node server.js --port=8080",
+			want:  Step{Kind: KindSTART, Raw: "START node server.js --port=8080", Shell: "node server.js --port=8080"},
+		},
+		{
+			name:    "START no command",
+			input:   "START",
+			wantErr: true,
+		},
+		// ENV
+		{
+			name:  "ENV basic",
+			input: "ENV FOO=bar",
+			want:  Step{Kind: KindENV, Raw: "ENV FOO=bar", Key: "FOO", Value: "bar"},
+		},
+		{
+			name:  "ENV value with spaces",
+			input: "ENV GREETING=hello world",
+			want:  Step{Kind: KindENV, Raw: "ENV GREETING=hello world", Key: "GREETING", Value: "hello world"},
+		},
+		{
+			name:  "ENV value with equals sign",
+			input: "ENV URL=http://example.com?a=1",
+			want:  Step{Kind: KindENV, Raw: "ENV URL=http://example.com?a=1", Key: "URL", Value: "http://example.com?a=1"},
+		},
+		{
+			name:  "ENV empty value",
+			input: "ENV FOO=",
+			want:  Step{Kind: KindENV, Raw: "ENV FOO=", Key: "FOO", Value: ""},
+		},
+		{
+			name:    "ENV missing equals",
+			input:   "ENV FOO",
+			wantErr: true,
+		},
+		{
+			name:    "ENV empty key",
+			input:   "ENV =value",
+			wantErr: true,
+		},
+		// WORKDIR
+		{
+			name:  "WORKDIR basic",
+			input: "WORKDIR /app",
+			want:  Step{Kind: KindWORKDIR, Raw: "WORKDIR /app", Path: "/app"},
+		},
+		{
+			name:  "WORKDIR with spaces in path",
+			input: "WORKDIR /my project",
+			want:  Step{Kind: KindWORKDIR, Raw: "WORKDIR /my project", Path: "/my project"},
+		},
+		{
+			name:    "WORKDIR empty",
+			input:   "WORKDIR",
+			wantErr: true,
+		},
+		// USER and COPY stubs
+		{
+			name:  "USER stub",
+			input: "USER www-data",
+			want:  Step{Kind: KindUSER, Raw: "USER www-data"},
+		},
+		{
+			name:  "COPY stub",
+			input: "COPY config.yaml /etc/app/config.yaml",
+			want:  Step{Kind: KindCOPY, Raw: "COPY config.yaml /etc/app/config.yaml"},
+		},
+		// Unknown keyword
+		{
+			name:    "unknown keyword",
+			input:   "FROBNICATE something",
+			wantErr: true,
+		},
+		// Empty input
+		{
+			name:    "empty string",
+			input:   "",
+			wantErr: true,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, err := ParseStep(tc.input)
+			if tc.wantErr {
+				if err == nil {
+					t.Fatalf("ParseStep(%q) expected error, got %+v", tc.input, got)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("ParseStep(%q) unexpected error: %v", tc.input, err)
+			}
+			if got != tc.want {
+				t.Errorf("ParseStep(%q)\n  got  %+v\n  want %+v", tc.input, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestParseRecipe exercises ParseRecipe on a well-formed five-line recipe, on
+// a recipe containing an unknown instruction, and on a nil recipe.
+func TestParseRecipe(t *testing.T) {
+	t.Run("valid recipe", func(t *testing.T) {
+		got, err := ParseRecipe([]string{
+			"RUN apt update",
+			"WORKDIR /app",
+			"ENV PORT=8080",
+			"START python3 server.py",
+			"RUN --timeout=2m pip install -r requirements.txt",
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if n := len(got); n != 5 {
+			t.Fatalf("expected 5 steps, got %d", n)
+		}
+		if k := got[0].Kind; k != KindRUN {
+			t.Errorf("step 0: want KindRUN, got %v", k)
+		}
+		if k := got[1].Kind; k != KindWORKDIR {
+			t.Errorf("step 1: want KindWORKDIR, got %v", k)
+		}
+		if k := got[3].Kind; k != KindSTART {
+			t.Errorf("step 3: want KindSTART, got %v", k)
+		}
+		if d := got[4].Timeout; d != 2*time.Minute {
+			t.Errorf("step 4: want 2m timeout, got %v", d)
+		}
+	})
+
+	t.Run("error on invalid line", func(t *testing.T) {
+		if _, err := ParseRecipe([]string{"RUN apt update", "BADCMD something"}); err == nil {
+			t.Fatal("expected error for invalid line, got nil")
+		}
+	})
+
+	t.Run("empty recipe", func(t *testing.T) {
+		got, err := ParseRecipe(nil)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if n := len(got); n != 0 {
+			t.Fatalf("expected 0 steps, got %d", n)
+		}
+	})
+}
--- a/internal/service/build.go
+++ b/internal/service/build.go
@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log/slog"
+	"sync"
 	"time"

 	"connectrpc.com/connect"
@ -14,6 +15,7 @@ import (
 	"git.omukk.dev/wrenn/sandbox/internal/db"
 	"git.omukk.dev/wrenn/sandbox/internal/id"
 	"git.omukk.dev/wrenn/sandbox/internal/lifecycle"
+	"git.omukk.dev/wrenn/sandbox/internal/recipe"
 	"git.omukk.dev/wrenn/sandbox/internal/scheduler"
 	pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
 )
@ -27,14 +29,14 @@ const (

 // preBuildCmds run before the user recipe to prepare the build environment.
 var preBuildCmds = []string{
-	"apt update",
+	"RUN apt update",
 }

 // postBuildCmds run after the user recipe to clean up caches and reduce image size.
 var postBuildCmds = []string{
-	"apt clean",
-	"apt autoremove -y",
-	"rm -rf /var/lib/apt/lists/*",
+	"RUN apt clean",
+	"RUN apt autoremove -y",
+	"RUN rm -rf /var/lib/apt/lists/*",
 }

 // buildAgentClient is the subset of the host agent client used by the build worker.
@ -46,24 +48,15 @@ type buildAgentClient interface {
 	FlattenRootfs(ctx context.Context, req *connect.Request[pb.FlattenRootfsRequest]) (*connect.Response[pb.FlattenRootfsResponse], error)
 }

-// BuildLogEntry represents a single entry in the build log JSONB array.
-type BuildLogEntry struct {
-	Step    int    `json:"step"`
-	Phase   string `json:"phase"` // "pre-build", "recipe", or "post-build"
-	Cmd     string `json:"cmd"`
-	Stdout  string `json:"stdout"`
-	Stderr  string `json:"stderr"`
-	Exit    int32  `json:"exit"`
-	Ok      bool   `json:"ok"`
-	Elapsed int64  `json:"elapsed_ms"`
-}
-
 // BuildService handles template build orchestration.
+//
+// A BuildService must not be copied after first use because it contains a
+// sync.Mutex.
 type BuildService struct {
 	DB        *db.Queries
 	Redis     *redis.Client
 	Pool      *lifecycle.HostClientPool
 	Scheduler scheduler.HostScheduler
+
+	// mu guards cancelMap. The map is created lazily by executeBuild, which
+	// registers an entry for the duration of each build and removes it when
+	// it returns; Cancel looks entries up to signal a running build.
+	mu        sync.Mutex
+	cancelMap map[string]context.CancelFunc // buildID → per-build cancel func
 }

 // BuildCreateParams holds the parameters for creating a template build.
@ -74,6 +67,7 @@ type BuildCreateParams struct {
 	Healthcheck  string
 	VCPUs        int32
 	MemoryMB     int32
+	SkipPrePost  bool
 }

 // Create inserts a new build record and enqueues it to Redis.
@ -97,6 +91,11 @@ func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.Temp
 	buildIDStr := id.FormatBuildID(buildID)
 	newTemplateID := id.NewTemplateID()

+	defaultSteps := len(preBuildCmds) + len(postBuildCmds)
+	if p.SkipPrePost {
+		defaultSteps = 0
+	}
+
 	build, err := s.DB.InsertTemplateBuild(ctx, db.InsertTemplateBuildParams{
 		ID:           buildID,
 		Name:         p.Name,
@ -105,9 +104,10 @@ func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.Temp
 		Healthcheck:  p.Healthcheck,
 		Vcpus:        p.VCPUs,
 		MemoryMb:     p.MemoryMB,
-		TotalSteps:   int32(len(p.Recipe) + len(preBuildCmds) + len(postBuildCmds)),
+		TotalSteps:   int32(len(p.Recipe) + defaultSteps),
 		TemplateID:   newTemplateID,
 		TeamID:       id.PlatformTeamID,
+		SkipPrePost:  p.SkipPrePost,
 	})
 	if err != nil {
 		return db.TemplateBuild{}, fmt.Errorf("insert build: %w", err)
@ -131,6 +131,40 @@ func (s *BuildService) List(ctx context.Context) ([]db.TemplateBuild, error) {
 	return s.DB.ListTemplateBuilds(ctx)
 }

+// Cancel cancels a pending or running build.
+//
+// It returns an error when the build cannot be loaded, when it has already
+// reached a terminal status (success, failed, cancelled), or when the status
+// update fails. Otherwise the build is marked cancelled in the DB and, if a
+// worker in this process has registered a cancel func for it, that func is
+// invoked. For pending builds the DB status is what makes the worker skip
+// the build when it is dequeued; for running builds cancelling the per-build
+// context aborts the current exec step, and executeBuild then detects the
+// cancellation and records the status.
+//
+// NOTE(review): the status check and the update are two separate queries, so
+// a build that finishes in between is still overwritten with "cancelled" —
+// confirm this is acceptable.
+func (s *BuildService) Cancel(ctx context.Context, buildID pgtype.UUID) error {
+	current, err := s.DB.GetTemplateBuild(ctx, buildID)
+	if err != nil {
+		return fmt.Errorf("get build: %w", err)
+	}
+	if st := current.Status; st == "success" || st == "failed" || st == "cancelled" {
+		return fmt.Errorf("build is already %s", st)
+	}
+
+	// The DB write comes first: it covers builds no worker has picked up yet
+	// and doubles as the flag executeBuild checks when it starts.
+	params := db.UpdateBuildStatusParams{ID: buildID, Status: "cancelled"}
+	if _, err := s.DB.UpdateBuildStatus(ctx, params); err != nil {
+		return fmt.Errorf("update build status: %w", err)
+	}
+
+	// Signal the per-build context if the build is running. The cancel func
+	// is called after the lock is released.
+	key := id.FormatBuildID(buildID)
+	s.mu.Lock()
+	stop, registered := s.cancelMap[key]
+	s.mu.Unlock()
+	if registered {
+		stop()
+	}
+
+	return nil
+}
+
 // StartWorkers launches n goroutines that consume from the Redis build queue.
 // The returned cancel function stops all workers.
 func (s *BuildService) StartWorkers(ctx context.Context, n int) context.CancelFunc {
@ -172,14 +206,38 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 		return
 	}

-	build, err := s.DB.GetTemplateBuild(ctx, buildID)
+	// Create a per-build context so this build can be cancelled independently of
+	// the worker. Register in cancelMap before fetching the build so that a
+	// concurrent Cancel call can always find and signal it.
+	buildCtx, buildCancel := context.WithCancel(ctx)
+	defer buildCancel()
+
+	s.mu.Lock()
+	if s.cancelMap == nil {
+		s.cancelMap = make(map[string]context.CancelFunc)
+	}
+	s.cancelMap[buildIDStr] = buildCancel
+	s.mu.Unlock()
+	defer func() {
+		s.mu.Lock()
+		delete(s.cancelMap, buildIDStr)
+		s.mu.Unlock()
+	}()
+
+	build, err := s.DB.GetTemplateBuild(buildCtx, buildID)
 	if err != nil {
 		log.Error("failed to fetch build", "error", err)
 		return
 	}

+	// Skip if already cancelled (Cancel was called before we dequeued).
+	if build.Status == "cancelled" {
+		log.Info("build already cancelled, skipping")
+		return
+	}
+
 	// Mark as running.
-	if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
+	if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
 		ID: buildID, Status: "running",
 	}); err != nil {
 		log.Error("failed to update build status", "error", err)
@ -187,22 +245,22 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	}

 	// Parse user recipe.
-	var recipe []string
-	if err := json.Unmarshal(build.Recipe, &recipe); err != nil {
-		s.failBuild(ctx, buildID, fmt.Sprintf("invalid recipe JSON: %v", err))
+	var userRecipe []string
+	if err := json.Unmarshal(build.Recipe, &userRecipe); err != nil {
+		s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid recipe JSON: %v", err))
 		return
 	}

 	// Pick a platform host and create a sandbox.
-	host, err := s.Scheduler.SelectHost(ctx, id.PlatformTeamID, false)
+	host, err := s.Scheduler.SelectHost(buildCtx, id.PlatformTeamID, false)
 	if err != nil {
-		s.failBuild(ctx, buildID, fmt.Sprintf("no host available: %v", err))
+		s.failBuild(buildCtx, buildID, fmt.Sprintf("no host available: %v", err))
 		return
 	}

 	agent, err := s.Pool.GetForHost(host)
 	if err != nil {
-		s.failBuild(ctx, buildID, fmt.Sprintf("agent client error: %v", err))
+		s.failBuild(buildCtx, buildID, fmt.Sprintf("agent client error: %v", err))
 		return
 	}

@ -214,16 +272,16 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	baseTeamID := id.PlatformTeamID
 	baseTemplateID := id.MinimalTemplateID
 	if build.BaseTemplate != "minimal" {
-		baseTmpl, err := s.DB.GetPlatformTemplateByName(ctx, build.BaseTemplate)
+		baseTmpl, err := s.DB.GetPlatformTemplateByName(buildCtx, build.BaseTemplate)
 		if err != nil {
-			s.failBuild(ctx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
 			return
 		}
 		baseTeamID = baseTmpl.TeamID
 		baseTemplateID = baseTmpl.ID
 	}

-	resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
+	resp, err := agent.CreateSandbox(buildCtx, connect.NewRequest(&pb.CreateSandboxRequest{
 		SandboxId:  sandboxIDStr,
 		Template:   build.BaseTemplate,
 		TeamId:     id.UUIDString(baseTeamID),
@ -234,129 +292,121 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 		DiskSizeMb: 5120, // 5 GB for template builds
 	}))
 	if err != nil {
-		s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
+		s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
 		return
 	}
 	_ = resp

 	// Record sandbox/host association.
-	_ = s.DB.UpdateBuildSandbox(ctx, db.UpdateBuildSandboxParams{
+	_ = s.DB.UpdateBuildSandbox(buildCtx, db.UpdateBuildSandboxParams{
 		ID:        buildID,
 		SandboxID: sandboxID,
 		HostID:    host.ID,
 	})

+	// Parse recipe steps. preBuildCmds and postBuildCmds are hardcoded and always
+	// valid; panic on error is appropriate here since it would be a programmer mistake.
+	preBuildSteps, err := recipe.ParseRecipe(preBuildCmds)
+	if err != nil {
+		panic(fmt.Sprintf("invalid pre-build recipe: %v", err))
+	}
+	userRecipeSteps, err := recipe.ParseRecipe(userRecipe)
+	if err != nil {
+		s.destroySandbox(buildCtx, agent, sandboxIDStr)
+		s.failBuild(buildCtx, buildID, fmt.Sprintf("recipe parse error: %v", err))
+		return
+	}
+	postBuildSteps, err := recipe.ParseRecipe(postBuildCmds)
+	if err != nil {
+		panic(fmt.Sprintf("invalid post-build recipe: %v", err))
+	}
+
 	// Execute build phases: pre-build → user recipe → post-build.
-	var logs []BuildLogEntry
+	// bctx carries working directory and env vars across all phases.
+	var logs []recipe.BuildLogEntry
 	step := 0
+	bctx := &recipe.ExecContext{}

-	// Helper to run a list of commands in a given phase.
-	// timeout=0 means no timeout (uses parent context).
-	runPhase := func(phase string, cmds []string, timeout time.Duration) bool {
-		for _, cmd := range cmds {
-			step++
-			log.Info("executing build step", "phase", phase, "step", step, "cmd", cmd)
-
-			execCtx := ctx
-			var cancel context.CancelFunc
-			// When no timeout is specified, use 10 minutes as a generous upper
-			// bound. The host agent defaults TimeoutSec=0 to 30s, so we must
-			// always send an explicit value.
-			effectiveTimeout := timeout
-			if effectiveTimeout <= 0 {
-				effectiveTimeout = 10 * time.Minute
-			}
-			execCtx, cancel = context.WithTimeout(ctx, effectiveTimeout)
-			timeoutSec := int32(effectiveTimeout.Seconds())
-
-			start := time.Now()
-			execResp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
-				SandboxId:  sandboxIDStr,
-				Cmd:        "/bin/sh",
-				Args:       []string{"-c", cmd},
-				TimeoutSec: timeoutSec,
-			}))
-			cancel()
-
-			entry := BuildLogEntry{
-				Step:    step,
-				Phase:   phase,
-				Cmd:     cmd,
-				Elapsed: time.Since(start).Milliseconds(),
-			}
-
-			if err != nil {
-				entry.Stderr = err.Error()
-				entry.Ok = false
-				logs = append(logs, entry)
-				s.updateLogs(ctx, buildID, step, logs)
-				s.destroySandbox(ctx, agent, sandboxIDStr)
-				s.failBuild(ctx, buildID, fmt.Sprintf("%s step %d failed: %v", phase, step, err))
+	runPhase := func(phase string, steps []recipe.Step, defaultTimeout time.Duration) bool {
+		newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step, defaultTimeout, bctx, agent.Exec)
+		logs = append(logs, newEntries...)
+		step = nextStep
+		s.updateLogs(buildCtx, buildID, step, logs)
+		if !ok {
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			// If the build was cancelled, status is already set — don't overwrite with "failed".
+			if buildCtx.Err() != nil {
 				return false
 			}
-
-			entry.Stdout = string(execResp.Msg.Stdout)
-			entry.Stderr = string(execResp.Msg.Stderr)
-			entry.Exit = execResp.Msg.ExitCode
-			entry.Ok = execResp.Msg.ExitCode == 0
-			logs = append(logs, entry)
-			s.updateLogs(ctx, buildID, step, logs)
-
-			if execResp.Msg.ExitCode != 0 {
-				s.destroySandbox(ctx, agent, sandboxIDStr)
-				s.failBuild(ctx, buildID, fmt.Sprintf("%s step %d failed with exit code %d", phase, step, execResp.Msg.ExitCode))
-				return false
+			last := newEntries[len(newEntries)-1]
+			reason := last.Stderr
+			if reason == "" {
+				reason = fmt.Sprintf("exit code %d", last.Exit)
 			}
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("%s step %d failed: %s", phase, step, reason))
 		}
-		return true
+		return ok
 	}

-	if !runPhase("pre-build", preBuildCmds, 0) {
+	if !build.SkipPrePost {
+		if !runPhase("pre-build", preBuildSteps, 0) {
+			return
+		}
+	}
+	if !runPhase("recipe", userRecipeSteps, buildCommandTimeout) {
 		return
 	}
-	if !runPhase("recipe", recipe, buildCommandTimeout) {
-		return
-	}
-	if !runPhase("post-build", postBuildCmds, 0) {
-		return
+	if !build.SkipPrePost {
+		if !runPhase("post-build", postBuildSteps, 0) {
+			return
+		}
 	}

 	// Healthcheck or direct snapshot.
 	var sizeBytes int64
 	if build.Healthcheck != "" {
 		log.Info("running healthcheck", "cmd", build.Healthcheck)
-		if err := s.waitForHealthcheck(ctx, agent, sandboxIDStr, build.Healthcheck); err != nil {
-			s.destroySandbox(ctx, agent, sandboxIDStr)
-			s.failBuild(ctx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
+		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, build.Healthcheck); err != nil {
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			if buildCtx.Err() != nil {
+				return
+			}
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
 			return
 		}

 		// Healthcheck passed → full snapshot (with memory/CPU state).
 		log.Info("healthcheck passed, creating snapshot")
-		snapResp, err := agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
+		snapResp, err := agent.CreateSnapshot(buildCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
 			SandboxId:  sandboxIDStr,
 			Name:       build.Name,
 			TeamId:     id.UUIDString(build.TeamID),
 			TemplateId: id.UUIDString(build.TemplateID),
 		}))
 		if err != nil {
-			s.destroySandbox(ctx, agent, sandboxIDStr)
-			s.failBuild(ctx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			if buildCtx.Err() != nil {
+				return
+			}
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
 			return
 		}
 		sizeBytes = snapResp.Msg.SizeBytes
 	} else {
 		// No healthcheck → image-only template (rootfs only).
 		log.Info("no healthcheck, flattening rootfs")
-		flatResp, err := agent.FlattenRootfs(ctx, connect.NewRequest(&pb.FlattenRootfsRequest{
+		flatResp, err := agent.FlattenRootfs(buildCtx, connect.NewRequest(&pb.FlattenRootfsRequest{
 			SandboxId:  sandboxIDStr,
 			Name:       build.Name,
 			TeamId:     id.UUIDString(build.TeamID),
 			TemplateId: id.UUIDString(build.TemplateID),
 		}))
 		if err != nil {
-			s.destroySandbox(ctx, agent, sandboxIDStr)
-			s.failBuild(ctx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			if buildCtx.Err() != nil {
+				return
+			}
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
 			return
 		}
 		sizeBytes = flatResp.Msg.SizeBytes
@ -368,7 +418,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 		templateType = "snapshot"
 	}

-	if _, err := s.DB.InsertTemplate(ctx, db.InsertTemplateParams{
+	if _, err := s.DB.InsertTemplate(buildCtx, db.InsertTemplateParams{
 		ID:        build.TemplateID,
 		Name:      build.Name,
 		Type:      templateType,
@ -386,7 +436,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	// No additional destroy needed.

 	// Mark build as success.
-	if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
+	if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
 		ID: buildID, Status: "success",
 	}); err != nil {
 		log.Error("failed to mark build as success", "error", err)
@ -429,7 +479,7 @@ func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentC
 	}
 }

-func (s *BuildService) updateLogs(ctx context.Context, buildID pgtype.UUID, step int, logs []BuildLogEntry) {
+func (s *BuildService) updateLogs(ctx context.Context, buildID pgtype.UUID, step int, logs []recipe.BuildLogEntry) {
 	logsJSON, err := json.Marshal(logs)
 	if err != nil {
 		slog.Warn("failed to marshal build logs", "error", err)