// wrenn-releases/internal/recipe/executor.go

package recipe

import (
	"context"
	"fmt"
	"log/slog"
	"path"
	"strings"
	"time"

	"connectrpc.com/connect"

	pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
)

// Defaults shared by the recipe executor and its callers.
const (
	// DefaultStepTimeout is the fallback timeout for RUN steps that carry no
	// explicit --timeout flag. Execute receives its fallback through the
	// defaultTimeout parameter rather than reading this constant itself.
	DefaultStepTimeout = 30 * time.Second

	// BuildFilesDir is the directory inside the sandbox where uploaded build
	// archives are extracted. COPY sources are resolved relative to it.
	BuildFilesDir = "/tmp/build-files"
)

// BuildLogEntry is the per-step record stored in template_builds.logs (JSONB).
// The executor produces one entry per executed ENV, WORKDIR, USER, COPY, START
// and RUN step.
type BuildLogEntry struct {
	// Step is the step number, counted continuously across phases.
	Step    int    `json:"step"`
	// Phase is the phase label the caller passed to Execute.
	Phase   string `json:"phase"`
	// Cmd is the raw instruction text of the step (Step.Raw).
	Cmd     string `json:"cmd"`
	// Stdout is the captured standard output. It is left empty for ENV and
	// START steps and whenever the exec call itself returned an error.
	Stdout  string `json:"stdout"`
	// Stderr is the captured standard error, or an executor-generated message
	// (e.g. "exec error: ...") when the step failed before or outside the command.
	Stderr  string `json:"stderr"`
	// Exit is the exit code reported by the exec response; it stays 0 when no
	// response was received.
	Exit    int32  `json:"exit"`
	// Ok reports whether the step succeeded (exit code 0; always true for ENV).
	Ok      bool   `json:"ok"`
	// Elapsed is the wall-clock duration of the exec call in milliseconds;
	// it is 0 for steps that do not call the agent (ENV, rejected COPY sources).
	Elapsed int64  `json:"elapsed_ms"`
}

// ExecFunc is the agent.Exec call signature used by the executor. It matches
// the method on the hostagent Connect RPC client.
//
// The executor always invokes it with Cmd "/bin/sh" and Args {"-c", <script>}.
// A non-nil error is recorded as a failed step; otherwise the response's
// ExitCode, Stdout and Stderr are copied into the step's BuildLogEntry.
type ExecFunc func(ctx context.Context, req *connect.Request[pb.ExecRequest]) (*connect.Response[pb.ExecResponse], error)

// ProgressFunc is called by Execute after each step that completes
// successfully, with the current step counter and the log entries accumulated
// so far during that Execute call. It is not invoked for a failing step;
// Execute returns immediately in that case. Used for per-step DB progress
// updates.
type ProgressFunc func(step int, entries []BuildLogEntry)

// Execute runs steps sequentially against sandboxID using execFn and stops at
// the first step that fails.
//
//   - phase labels the log entries (e.g., "pre-build", "recipe", "post-build").
//   - startStep is the number of the last step executed by earlier phases (0
//     for the first phase). The first step run here is numbered startStep+1,
//     so entries are numbered globally and 1-based across phases.
//   - defaultTimeout applies to RUN steps with no per-step timeout; values
//     <= 0 fall back to 10 minutes.
//   - bctx is mutated in place as ENV/WORKDIR/USER steps execute, and carries
//     forward into subsequent phases when the caller passes the same pointer.
//     A nil bctx is replaced by an empty context that lives only for this call.
//   - onProgress (may be nil) is called after each step that succeeds. It is
//     not called for a failing step; the failure is reported through the
//     return values instead.
//
// Returns all log entries appended during this call, the number of the last
// step that was attempted (pass it as startStep to the next phase), and
// whether all steps succeeded. On false the last entry contains failure
// details; the caller is responsible for destroying the sandbox and recording
// the build error.
func Execute(
	ctx context.Context,
	phase string,
	steps []Step,
	sandboxID string,
	startStep int,
	defaultTimeout time.Duration,
	bctx *ExecContext,
	execFn ExecFunc,
	onProgress ProgressFunc,
) (entries []BuildLogEntry, nextStep int, ok bool) {
	if defaultTimeout <= 0 {
		defaultTimeout = 10 * time.Minute
	}
	if bctx == nil {
		// Every step kind dereferences bctx; fall back to an empty context
		// instead of panicking when the caller has none to share.
		bctx = &ExecContext{}
	}

	step := startStep
	for _, st := range steps {
		step++
		slog.Info("executing build step", "phase", phase, "step", step, "instruction", st.Raw)

		// Kinds not listed below are skipped without a log entry, but they
		// still consume a step number and trigger onProgress.
		switch st.Kind {
		case KindENV:
			// ENV never touches the sandbox: it only updates the context that
			// later steps are wrapped with.
			if bctx.EnvVars == nil {
				bctx.EnvVars = make(map[string]string)
			}
			bctx.EnvVars[st.Key] = expandEnv(st.Value, bctx.EnvVars)
			entries = append(entries, BuildLogEntry{Step: step, Phase: phase, Cmd: st.Raw, Ok: true})

		case KindWORKDIR:
			// Create the directory if it doesn't exist; only switch the working
			// directory once it is known to be there.
			entry := execRawShell(ctx, st.Raw, sandboxID, phase, step, 10*time.Second, execFn,
				"mkdir -p "+shellescape(st.Path))
			entries = append(entries, entry)
			if !entry.Ok {
				return entries, step, false
			}
			bctx.WorkDir = st.Path

		case KindUSER:
			entry, succeeded := execUser(ctx, st, sandboxID, phase, step, bctx, execFn)
			entries = append(entries, entry)
			if !succeeded {
				return entries, step, false
			}

		case KindCOPY:
			entry, succeeded := execCopy(ctx, st, sandboxID, phase, step, bctx, execFn)
			entries = append(entries, entry)
			if !succeeded {
				return entries, step, false
			}

		case KindSTART:
			entry, succeeded := execStart(ctx, st, sandboxID, phase, step, bctx, execFn)
			entries = append(entries, entry)
			if !succeeded {
				return entries, step, false
			}

		case KindRUN:
			// A per-step timeout, when present, overrides the phase default.
			timeout := defaultTimeout
			if st.Timeout > 0 {
				timeout = st.Timeout
			}
			entry, succeeded := execRun(ctx, st, sandboxID, phase, step, timeout, bctx, execFn)
			entries = append(entries, entry)
			if !succeeded {
				return entries, step, false
			}
		}

		if onProgress != nil {
			onProgress(step, entries)
		}
	}
	return entries, step, true
}

// execRun executes a RUN step. st.Shell is wrapped by bctx.WrappedCommand and
// run through "/bin/sh -c" in the sandbox. The call is bounded by timeout on
// the client side (context deadline) and on the agent side (TimeoutSec).
//
// It returns the log entry for the step and whether the step succeeded, i.e.
// the exec call returned no error and the command exited with code 0. When
// the call itself fails, the error text is stored in the entry's Stderr.
func execRun(
	ctx context.Context,
	st Step,
	sandboxID, phase string,
	step int,
	timeout time.Duration,
	bctx *ExecContext,
	execFn ExecFunc,
) (BuildLogEntry, bool) {
	execCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Round up to whole seconds. Truncating would send TimeoutSec 0 for a
	// sub-second timeout and give the agent less time than the caller asked
	// for on any fractional value.
	timeoutSec := int32((timeout + time.Second - 1) / time.Second)

	start := time.Now()
	resp, err := execFn(execCtx, connect.NewRequest(&pb.ExecRequest{
		SandboxId:  sandboxID,
		Cmd:        "/bin/sh",
		Args:       []string{"-c", bctx.WrappedCommand(st.Shell)},
		TimeoutSec: timeoutSec,
	}))

	entry := BuildLogEntry{
		Step:    step,
		Phase:   phase,
		Cmd:     st.Raw,
		Elapsed: time.Since(start).Milliseconds(),
	}
	if err != nil {
		// Transport error or client-side deadline: there is no response to
		// read output or an exit code from.
		entry.Stderr = fmt.Sprintf("exec error: %v", err)
		return entry, false
	}
	entry.Stdout = string(resp.Msg.Stdout)
	entry.Stderr = string(resp.Msg.Stderr)
	entry.Exit = resp.Msg.ExitCode
	entry.Ok = resp.Msg.ExitCode == 0
	return entry, entry.Ok
}

// execUser handles a USER step. It makes sure the unix account named by
// st.Key exists in the sandbox and, on success, switches bctx over to it.
//
// The shell script first probes the account with `id`. Only when that probe
// fails does it create the account (home directory, bash shell, no password)
// and append a passwordless-sudo line to /etc/sudoers (E2B convention), so an
// account that already exists is left untouched. The sudoers line is emitted
// with printf %s so the username is passed as data rather than spliced into
// the format.
//
// On success bctx.User is set and bctx.EnvVars["HOME"] is pointed at "/root"
// for root or "/home/<username>" otherwise, so ~ expands correctly in later
// steps. The home path is derived from the name, not queried from the sandbox.
func execUser(
	ctx context.Context,
	st Step,
	sandboxID, phase string,
	step int,
	bctx *ExecContext,
	execFn ExecFunc,
) (BuildLogEntry, bool) {
	name := st.Key
	quoted := shellescape(name)

	script := fmt.Sprintf(
		"id %s >/dev/null 2>&1 || (adduser --disabled-password --gecos '' --shell /bin/bash %s && printf '%%s ALL=(ALL) NOPASSWD:ALL\\n' %s >> /etc/sudoers)",
		quoted, quoted, quoted,
	)

	result := execRawShell(ctx, st.Raw, sandboxID, phase, step, 30*time.Second, execFn, script)
	if !result.Ok {
		// Leave the build context untouched when the account could not be set up.
		return result, false
	}

	home := "/home/" + name
	if name == "root" {
		home = "/root"
	}

	bctx.User = name
	if bctx.EnvVars == nil {
		bctx.EnvVars = make(map[string]string)
	}
	bctx.EnvVars["HOME"] = home
	return result, true
}

// execCopy handles a COPY step: it copies one or more files or directories
// from the build archive (extracted at BuildFilesDir) to st.Dst inside the
// sandbox and then hands ownership of the destination to the current build
// user (bctx.User, or root when none is set).
//
// Every source must be a relative path that stays inside the archive
// directory; absolute paths and paths that climb out via ".." are rejected
// before anything is executed. A relative destination is resolved against
// bctx.WorkDir when one is set.
//
// It returns the log entry for the step and whether the step succeeded.
// Validation failures are reported through the entry's Stderr without calling
// the agent.
//
// NOTE(review): chown -R is applied to the whole destination path, so copying
// into an existing directory re-owns everything already under it, not only
// the files just copied. The "owner:owner" form also assumes a group named
// after the user — confirm this holds for all build users.
func execCopy(
	ctx context.Context,
	st Step,
	sandboxID, phase string,
	step int,
	bctx *ExecContext,
	execFn ExecFunc,
) (BuildLogEntry, bool) {
	// reject builds the failure entry for a step refused before execution.
	reject := func(msg string) (BuildLogEntry, bool) {
		return BuildLogEntry{Step: step, Phase: phase, Cmd: st.Raw, Stderr: msg}, false
	}

	// Without these guards the generated cp command would be malformed and
	// fail with a confusing message from the shell.
	if len(st.Srcs) == 0 {
		return reject("COPY requires at least one source path")
	}
	if st.Dst == "" {
		return reject("COPY requires a destination path")
	}

	// Validate all source paths: must be relative and not escape the archive
	// directory. After path.Clean an escaping path is exactly ".." or starts
	// with "../"; a bare ".." prefix test would also reject legitimate names
	// such as "..config".
	srcPaths := make([]string, 0, len(st.Srcs))
	for _, s := range st.Srcs {
		cleaned := path.Clean(s)
		if cleaned == ".." || strings.HasPrefix(cleaned, "../") || strings.HasPrefix(cleaned, "/") {
			return reject(fmt.Sprintf("COPY source must be a relative path within the archive: %q", s))
		}
		srcPaths = append(srcPaths, shellescape(BuildFilesDir+"/"+cleaned))
	}

	dst := st.Dst
	// Resolve relative destination against the current WORKDIR. Plain
	// concatenation keeps a trailing "/" on dst, which cp treats as
	// "must be a directory".
	if dst[0] != '/' && bctx.WorkDir != "" {
		dst = bctx.WorkDir + "/" + dst
	}
	owner := "root"
	if bctx.User != "" {
		owner = bctx.User
	}
	script := fmt.Sprintf(
		"cp -r %s %s && chown -R %s:%s %s",
		strings.Join(srcPaths, " "), shellescape(dst), shellescape(owner), shellescape(owner), shellescape(dst),
	)

	entry := execRawShell(ctx, st.Raw, sandboxID, phase, step, 60*time.Second, execFn, script)
	return entry, entry.Ok
}

// execRawShell sends shellCmd to the sandbox as `/bin/sh -c <shellCmd>`
// without any ExecContext wrapping, so no build user, environment or working
// directory is applied (per the original note it runs as root — confirm
// against the host agent). It backs internal operations such as directory
// creation, user creation and file copy.
//
// raw is the instruction text recorded as the entry's Cmd. timeout bounds the
// call through the context deadline and is also forwarded to the agent as
// whole seconds. The returned entry has Ok set only when the call returned no
// error and the command exited with code 0; a call error is recorded in Stderr.
func execRawShell(
	ctx context.Context,
	raw, sandboxID, phase string,
	step int,
	timeout time.Duration,
	execFn ExecFunc,
	shellCmd string,
) BuildLogEntry {
	callCtx, release := context.WithTimeout(ctx, timeout)
	defer release()

	began := time.Now()
	request := connect.NewRequest(&pb.ExecRequest{
		SandboxId:  sandboxID,
		Cmd:        "/bin/sh",
		Args:       []string{"-c", shellCmd},
		TimeoutSec: int32(timeout.Seconds()),
	})
	resp, callErr := execFn(callCtx, request)
	elapsedMs := time.Since(began).Milliseconds()

	if callErr != nil {
		// No response to inspect: report the call error and leave Ok false.
		return BuildLogEntry{
			Step:    step,
			Phase:   phase,
			Cmd:     raw,
			Stderr:  fmt.Sprintf("exec error: %v", callErr),
			Elapsed: elapsedMs,
		}
	}

	code := resp.Msg.ExitCode
	return BuildLogEntry{
		Step:    step,
		Phase:   phase,
		Cmd:     raw,
		Stdout:  string(resp.Msg.Stdout),
		Stderr:  string(resp.Msg.Stderr),
		Exit:    code,
		Ok:      code == 0,
		Elapsed: elapsedMs,
	}
}

// execStart handles a START step. st.Shell is turned into a launch command by
// bctx.StartCommand and run through "/bin/sh -c" in the sandbox.
//
// The call uses a fixed 10-second timeout: just long enough for the shell to
// fork and return, while the background process itself keeps running inside
// the VM. The launcher's stdout is not recorded. Stderr is filled in only on
// failure, either with the call error or with the exit code and the
// launcher's stderr.
//
// It returns the log entry for the step and whether the launch succeeded.
func execStart(
	ctx context.Context,
	st Step,
	sandboxID, phase string,
	step int,
	bctx *ExecContext,
	execFn ExecFunc,
) (BuildLogEntry, bool) {
	launchCtx, release := context.WithTimeout(ctx, 10*time.Second)
	defer release()

	began := time.Now()
	request := connect.NewRequest(&pb.ExecRequest{
		SandboxId:  sandboxID,
		Cmd:        "/bin/sh",
		Args:       []string{"-c", bctx.StartCommand(st.Shell)},
		TimeoutSec: 10,
	})
	resp, callErr := execFn(launchCtx, request)

	result := BuildLogEntry{
		Step:    step,
		Phase:   phase,
		Cmd:     st.Raw,
		Elapsed: time.Since(began).Milliseconds(),
	}

	switch {
	case callErr != nil:
		// The launch request itself failed; there is no exit code to record.
		result.Stderr = fmt.Sprintf("start error: %v", callErr)
	case resp.Msg.ExitCode != 0:
		result.Exit = resp.Msg.ExitCode
		result.Stderr = fmt.Sprintf("start failed with exit code %d: %s", resp.Msg.ExitCode, string(resp.Msg.Stderr))
	default:
		result.Exit = resp.Msg.ExitCode
		result.Ok = true
	}
	return result, result.Ok
}