feat: add env expansion, sandbox env fetching, and configurable healthchecks

Fix ENV instructions to expand $VAR references at set time using the current env state, preventing self-referencing values like PATH=/opt/venv/bin:$PATH from producing recursive expansions. Remove expandEnv from shellPrefix to avoid double expansion. Fetch sandbox environment variables via `env` before recipe execution so ENV steps resolve against actual runtime values from the base template image. Replace hardcoded healthcheck timing with a Dockerfile-like flag parser supporting --interval, --timeout, --start-period, and --retries. Add start-period grace window and bounded retry counting to waitForHealthcheck. Add python-interpreter-v0-beta recipe and healthcheck files.
2026-04-07 01:15:43 +06:00
parent ab38c8372c
commit 4f340b8847
10 changed files with 537 additions and 17 deletions
--- a/internal/service/build.go
+++ b/internal/service/build.go
@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log/slog"
+	"strings"
 	"sync"
 	"time"

@ -23,8 +24,6 @@ import (
 const (
 	buildQueueKey       = "wrenn:build_queue"
 	buildCommandTimeout = 30 * time.Second
-	healthcheckInterval = 1 * time.Second
-	healthcheckTimeout  = 60 * time.Second
 )

 // preBuildCmds run before the user recipe to prepare the build environment.
@ -321,11 +320,18 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 		panic(fmt.Sprintf("invalid post-build recipe: %v", err))
 	}

-	// Execute build phases: pre-build → user recipe → post-build.
-	// bctx carries working directory and env vars across all phases.
 	var logs []recipe.BuildLogEntry
 	step := 0
-	bctx := &recipe.ExecContext{}
+
+	envVars, err := s.fetchSandboxEnv(buildCtx, agent, sandboxIDStr)
+	if err != nil {
+		log.Warn("failed to fetch sandbox env, using defaults", "error", err)
+		envVars = map[string]string{
+			"PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"HOME": "/root",
+		}
+	}
+	bctx := &recipe.ExecContext{EnvVars: envVars}

 	runPhase := func(phase string, steps []recipe.Step, defaultTimeout time.Duration) bool {
 		newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step, defaultTimeout, bctx, agent.Exec)
@ -365,8 +371,14 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	// Healthcheck or direct snapshot.
 	var sizeBytes int64
 	if build.Healthcheck != "" {
-		log.Info("running healthcheck", "cmd", build.Healthcheck)
-		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, build.Healthcheck); err != nil {
+		hc, err := recipe.ParseHealthcheck(build.Healthcheck)
+		if err != nil {
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
+			return
+		}
+		log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
+		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc); err != nil {
 			s.destroySandbox(buildCtx, agent, sandboxIDStr)
 			if buildCtx.Err() != nil {
 				return
@ -445,36 +457,61 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	log.Info("template build completed successfully", "name", build.Name)
 }

-func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr, cmd string) error {
-	deadline := time.NewTimer(healthcheckTimeout)
+// waitForHealthcheck polls the healthcheck command inside the sandbox until it
+// succeeds or the check is abandoned.
+//
+// The command is run via `/bin/sh -c` once per hc.Interval tick (the first
+// run happens one interval after the call, not immediately), and each run is
+// bounded by hc.Timeout. Failures observed before hc.StartPeriod has elapsed
+// are logged but do not count toward the retry budget. When hc.Retries is
+// positive, the function gives up after that many counted failures; otherwise
+// only the overall deadline (sized for 100 attempts) or ctx ends the loop.
+//
+// Returns nil on the first run that exits with code 0. Returns an error if
+// hc.Interval or hc.Timeout is not positive, if the retry budget is
+// exhausted, if the overall deadline passes, or ctx.Err() if ctx is done.
+//
+// NOTE(review): the overall deadline is derived from the interval only, so
+// checks that run close to hc.Timeout can hit the deadline before the retry
+// budget is spent — confirm this is intended.
+func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr string, hc recipe.HealthcheckConfig) error {
+	// time.NewTicker panics on a non-positive interval, and a non-positive
+	// timeout would hand every exec an already-expired context.
+	if hc.Interval <= 0 {
+		return fmt.Errorf("healthcheck interval must be positive, got %s", hc.Interval)
+	}
+	if hc.Timeout <= 0 {
+		return fmt.Errorf("healthcheck timeout must be positive, got %s", hc.Timeout)
+	}
+
+	maxAttempts := 100
+	if hc.Retries > 0 {
+		maxAttempts = hc.Retries
+	}
+	// Round up so a sub-second timeout is not truncated to TimeoutSec 0.
+	timeoutSec := int32((hc.Timeout + time.Second - 1) / time.Second)
+
+	deadline := time.NewTimer(hc.StartPeriod + time.Duration(maxAttempts+1)*hc.Interval)
 	defer deadline.Stop()
-	ticker := time.NewTicker(healthcheckInterval)
+	ticker := time.NewTicker(hc.Interval)
 	defer ticker.Stop()

+	startedAt := time.Now()
+	failCount := 0
+
+	// recordFailure counts one failed attempt once the start period has
+	// elapsed and reports whether the retry budget is now exhausted.
+	recordFailure := func() bool {
+		if time.Since(startedAt) < hc.StartPeriod {
+			return false
+		}
+		failCount++
+		return hc.Retries > 0 && failCount >= hc.Retries
+	}
+
 	for {
 		select {
 		case <-ctx.Done():
 			return ctx.Err()
 		case <-deadline.C:
-			return fmt.Errorf("healthcheck timed out after %s", healthcheckTimeout)
+			return fmt.Errorf("healthcheck timed out after %s (%d failed attempts counted)", time.Since(startedAt).Round(time.Millisecond), failCount)
 		case <-ticker.C:
-			execCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+			execCtx, cancel := context.WithTimeout(ctx, hc.Timeout)
 			resp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
 				SandboxId:  sandboxIDStr,
 				Cmd:        "/bin/sh",
-				Args:       []string{"-c", cmd},
-				TimeoutSec: 10,
+				Args:       []string{"-c", hc.Cmd},
+				TimeoutSec: timeoutSec,
 			}))
 			cancel()

 			if err != nil {
 				slog.Debug("healthcheck exec error (retrying)", "error", err)
+				if recordFailure() {
+					return fmt.Errorf("healthcheck failed after %d retries: exec error: %v", failCount, err)
+				}
 				continue
 			}
 			if resp.Msg.ExitCode == 0 {
 				return nil
 			}
 			slog.Debug("healthcheck failed (retrying)", "exit_code", resp.Msg.ExitCode)
+			if recordFailure() {
+				return fmt.Errorf("healthcheck failed after %d retries: exit code %d", failCount, resp.Msg.ExitCode)
+			}
 		}
 	}
 }
@ -517,3 +554,49 @@ func (s *BuildService) destroySandbox(_ context.Context, agent buildAgentClient,
 		slog.Warn("failed to destroy build sandbox", "sandbox_id", sandboxIDStr, "error", err)
 	}
 }
+
+// fetchSandboxEnv runs `env` through /bin/sh inside the given sandbox, using
+// the build agent's Exec call, and returns the reported variables as a
+// name → value map.
+//
+// An error is returned when the Exec call itself fails or when the command
+// exits with a non-zero code; in both cases the returned map is nil.
+func (s *BuildService) fetchSandboxEnv(ctx context.Context,
+	agent buildAgentClient, sandboxIDStr string) (map[string]string, error) {
+	req := connect.NewRequest(&pb.ExecRequest{
+		SandboxId:  sandboxIDStr,
+		Cmd:        "/bin/sh",
+		Args:       []string{"-c", "env"},
+		TimeoutSec: 10,
+	})
+
+	out, err := agent.Exec(ctx, req)
+	if err != nil {
+		return nil, fmt.Errorf("fetch env: %w", err)
+	}
+	if code := out.Msg.ExitCode; code != 0 {
+		return nil, fmt.Errorf("fetch env: command exited with code %d", code)
+	}
+
+	// Stdout holds the raw `env` listing; parsing is delegated.
+	return s.parseSandboxEnv(string(out.Msg.Stdout)), nil
+}
+
+// parseSandboxEnv converts the raw newline-separated output of an 'env'
+// command into a name → value map.
+//
+// Each line is split at its first '=', so values may themselves contain '='.
+// Values are kept verbatim (leading and trailing spaces included); only the
+// carriage return of a CRLF line ending is removed. Lines without '=', with
+// an empty name, or whose name contains a space or tab are skipped.
+//
+// Plain 'env' output cannot represent values containing newlines
+// unambiguously: only the first line of such a value is kept, and a
+// continuation line that happens to look like NAME=value is still parsed as
+// its own variable. If two lines share a name, the later one wins.
+func (s *BuildService) parseSandboxEnv(raw string) map[string]string {
+	envVars := make(map[string]string)
+
+	for line := range strings.SplitSeq(raw, "\n") {
+		// Do not TrimSpace the whole line: whitespace at the end of a value
+		// is significant and must survive.
+		line = strings.TrimSuffix(line, "\r")
+
+		name, value, found := strings.Cut(line, "=")
+		// Whitespace in the name marks an indented or free-text continuation
+		// line of a multi-line value, not a real variable.
+		if !found || name == "" || strings.ContainsAny(name, " \t") {
+			continue
+		}
+
+		envVars[name] = value
+	}
+
+	return envVars
+}