feat: add env expansion, sandbox env fetching, and configurable healthchecks

Fix ENV instructions to expand $VAR references at set time using the current env state, preventing self-referencing values like PATH=/opt/venv/bin:$PATH from producing recursive expansions. Remove expandEnv from shellPrefix to avoid double expansion. Fetch sandbox environment variables via `env` before recipe execution so ENV steps resolve against actual runtime values from the base template image. Replace hardcoded healthcheck timing with a Dockerfile-like flag parser supporting --interval, --timeout, --start-period, and --retries. Add start-period grace window and bounded retry counting to waitForHealthcheck. Add python-interpreter-v0-beta recipe and healthcheck files.
2026-04-07 01:15:43 +06:00
parent ab38c8372c
commit 4f340b8847
10 changed files with 537 additions and 17 deletions
--- a/internal/recipe/context.go
+++ b/internal/recipe/context.go
@ -1,6 +1,8 @@
 package recipe

-import "strings"
+import (
+	"strings"
+)

 // ExecContext holds mutable state that persists across recipe steps.
 // It is initialized empty and updated by ENV and WORKDIR steps.
@ -56,6 +58,74 @@ func (c *ExecContext) shellPrefix() string {
 	return sb.String()
 }

+// expandEnv replaces $var and ${var} placeholders in the string s with their
+// corresponding values from the vars map.
+//
+// Rules:
+//   - $$ is an escape and produces a single literal $.
+//   - $NAME consumes the longest run of name characters (see isNameChar).
+//   - ${NAME} uses everything between the braces as the name.
+//   - A variable that is not present in vars expands to the empty string.
+//   - A $ that does not introduce a reference (at the end of the string,
+//     before a non-name character, or starting an unterminated "${") is
+//     copied through unchanged.
+//
+// Expansion is a single pass: substituted values are never re-scanned, so a
+// value that itself contains '$' cannot trigger further expansion.
+func expandEnv(s string, vars map[string]string) string {
+	var sb strings.Builder
+	sb.Grow(len(s) * 2)
+
+	for {
+		idx := strings.IndexByte(s, '$')
+		if idx < 0 {
+			sb.WriteString(s)
+			break
+		}
+
+		// Copy the literal text before the '$'; s now starts at the '$'.
+		sb.WriteString(s[:idx])
+		s = s[idx:]
+
+		// A trailing '$' with nothing after it stays literal.
+		if len(s) == 1 {
+			sb.WriteByte('$')
+			break
+		}
+
+		var name string
+
+		switch next := s[1]; {
+		case next == '$':
+			// "$$" escape.
+			sb.WriteByte('$')
+			s = s[2:]
+			continue
+
+		case next == '{':
+			end := strings.IndexByte(s[2:], '}')
+			if end < 0 {
+				// Unterminated "${": keep the '$' and resume scanning
+				// right after it so the remainder is copied verbatim.
+				sb.WriteByte('$')
+				s = s[1:]
+				continue
+			}
+			name = s[2 : 2+end]
+			s = s[2+end+1:]
+
+		default:
+			j := 1
+			for j < len(s) && isNameChar(s[j]) {
+				j++
+			}
+			if j == 1 {
+				// '$' followed by a non-name byte (e.g. "$ 5" or "$/")
+				// is not a variable reference; keep the '$' instead of
+				// silently dropping it.
+				sb.WriteByte('$')
+				s = s[1:]
+				continue
+			}
+			name = s[1:j]
+			s = s[j:]
+		}
+
+		// A missing key (or a nil map) yields "", i.e. unset expands to empty.
+		sb.WriteString(vars[name])
+	}
+
+	return sb.String()
+}
+
+// isNameChar reports whether c may appear in an environment variable name:
+// an ASCII letter, an ASCII digit, or an underscore.
+func isNameChar(c byte) bool {
+	switch {
+	case c == '_':
+		return true
+	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
+		return true
+	case '0' <= c && c <= '9':
+		return true
+	default:
+		return false
+	}
+}
+
 // shellescape wraps s in single quotes, escaping any embedded single quotes.
 // This is POSIX-safe for paths, env values, and shell commands.
 func shellescape(s string) string {
--- a/internal/recipe/context_test.go
+++ b/internal/recipe/context_test.go
@ -45,6 +45,14 @@ func TestExecContext_WrappedCommand(t *testing.T) {
 			cmd:  "echo $MSG",
 			want: "MSG='it'\\''s fine' /bin/sh -c 'echo $MSG'",
 		},
+		{
+			name: "env expansion with dollar sign PATH",
+			ctx: ExecContext{
+				EnvVars: map[string]string{"PATH": "/usr/bin", "FOO": "/opt/venv/bin:$PATH"},
+			},
+			cmd:  "make build",
+			want: "FOO='/opt/venv/bin:/usr/bin' PATH='/usr/bin' /bin/sh -c 'make build'",
+		},
 	}

 	for _, tc := range tests {
@ -94,6 +102,109 @@ func TestExecContext_StartCommand(t *testing.T) {
 	}
 }

+func TestExpandEnv(t *testing.T) {
+	tests := []struct {
+		s    string
+		vars map[string]string
+		want string
+	}{
+		{
+			s:    "hello",
+			vars: nil,
+			want: "hello",
+		},
+		{
+			s:    "$PATH",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "/usr/bin",
+		},
+		{
+			s:    "${PATH}",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "/usr/bin",
+		},
+		{
+			s:    "/opt/venv/bin:$PATH",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "/opt/venv/bin:/usr/bin",
+		},
+		{
+			s:    "${HOME}/code",
+			vars: map[string]string{"HOME": "/root"},
+			want: "/root/code",
+		},
+		{
+			s:    "hello $USER",
+			vars: map[string]string{"USER": "admin"},
+			want: "hello admin",
+		},
+		{
+			s:    "$UNSET",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "",
+		},
+		{
+			s:    "${UNSET}",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "",
+		},
+		{
+			s:    "$$",
+			vars: map[string]string{"PATH": "/usr/bin"},
+			want: "$",
+		},
+		{
+			s:    "price is $$100",
+			vars: nil,
+			want: "price is $100",
+		},
+		{
+			s:    "$FOO:$BAR",
+			vars: map[string]string{"FOO": "a", "BAR": "b"},
+			want: "a:b",
+		},
+		{
+			s:    "${FOO}_${BAR}",
+			vars: map[string]string{"FOO": "hello", "BAR": "world"},
+			want: "hello_world",
+		},
+		{
+			s:    "no vars here",
+			vars: nil,
+			want: "no vars here",
+		},
+		{
+			s:    "$",
+			vars: nil,
+			want: "$",
+		},
+		{
+			s:    "${",
+			vars: nil,
+			want: "${",
+		},
+		{
+			s:    "${}",
+			vars: nil,
+			want: "",
+		},
+		{
+			s:    "$VAR1$VAR2",
+			vars: map[string]string{"VAR1": "a", "VAR2": "b"},
+			want: "ab",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.s, func(t *testing.T) {
+			got := expandEnv(tc.s, tc.vars)
+			if got != tc.want {
+				t.Errorf("expandEnv(%q, %v)\n  got  %q\n  want %q", tc.s, tc.vars, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestShellescape(t *testing.T) {
 	tests := []struct {
 		input string
--- a/internal/recipe/executor.go
+++ b/internal/recipe/executor.go
@ -68,7 +68,7 @@ func Execute(
 			if bctx.EnvVars == nil {
 				bctx.EnvVars = make(map[string]string)
 			}
-			bctx.EnvVars[st.Key] = st.Value
+			bctx.EnvVars[st.Key] = expandEnv(st.Value, bctx.EnvVars)
 			entries = append(entries, BuildLogEntry{Step: step, Phase: phase, Cmd: st.Raw, Ok: true})

 		case KindWORKDIR:
--- a/internal/recipe/healthcheck.go
+++ b/internal/recipe/healthcheck.go
@ -0,0 +1,94 @@
+package recipe
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// HealthcheckConfig holds the parsed configuration for a build healthcheck.
+// A healthcheck is a shell command that is executed repeatedly inside the
+// sandbox until it succeeds or the retry/timeout budget is exhausted.
+//
+// Retries of 0 means unlimited retries (bounded only by the overall
+// deadline).
+type HealthcheckConfig struct {
+	Cmd         string        // command line to run for each check
+	Interval    time.Duration // wait between consecutive checks
+	Timeout     time.Duration // maximum duration of a single check
+	StartPeriod time.Duration // initial grace window; failures inside it do not count toward Retries
+	Retries     int           // 0 = unlimited
+}
+
+// ParseHealthcheck parses a healthcheck string with optional flag prefix into
+// a HealthcheckConfig. The syntax is:
+//
+// [--interval=<duration>] [--timeout=<duration>] [--start-period=<duration>]
+// [--retries=<n>] <command>
+//
+// Flags must use the form --flag=value. The first token that does not start
+// with "--" and everything after it is treated as the command; the command
+// text is kept verbatim (inner whitespace and newlines are preserved, only
+// surrounding whitespace is trimmed). Defaults: interval=3s, timeout=10s,
+// start-period=0, retries=0 (unlimited).
+//
+// An error is returned when the input is empty, a flag is malformed or
+// unknown, a value fails to parse, interval or timeout is not strictly
+// positive, start-period or retries is negative, or no command follows the
+// flags.
+func ParseHealthcheck(s string) (HealthcheckConfig, error) {
+	// rest always holds the not-yet-consumed input with surrounding
+	// whitespace trimmed, so its first field is the token being examined.
+	rest := strings.TrimSpace(s)
+	if rest == "" {
+		return HealthcheckConfig{}, fmt.Errorf("empty healthcheck")
+	}
+
+	hc := HealthcheckConfig{
+		Interval: 3 * time.Second,
+		Timeout:  10 * time.Second,
+	}
+
+	for _, token := range strings.Fields(rest) {
+		if !strings.HasPrefix(token, "--") {
+			break
+		}
+
+		key, val, ok := strings.Cut(token, "=")
+		if !ok {
+			return HealthcheckConfig{}, fmt.Errorf("malformed flag (missing '='): %q", token)
+		}
+
+		switch key {
+		case "--interval":
+			d, err := time.ParseDuration(val)
+			if err != nil {
+				return HealthcheckConfig{}, fmt.Errorf("parse interval: %w", err)
+			}
+			// A non-positive interval would make time.NewTicker panic.
+			if d <= 0 {
+				return HealthcheckConfig{}, fmt.Errorf("interval must be positive, got %s", d)
+			}
+			hc.Interval = d
+		case "--timeout":
+			d, err := time.ParseDuration(val)
+			if err != nil {
+				return HealthcheckConfig{}, fmt.Errorf("parse timeout: %w", err)
+			}
+			// A non-positive timeout would expire every check immediately.
+			if d <= 0 {
+				return HealthcheckConfig{}, fmt.Errorf("timeout must be positive, got %s", d)
+			}
+			hc.Timeout = d
+		case "--start-period":
+			d, err := time.ParseDuration(val)
+			if err != nil {
+				return HealthcheckConfig{}, fmt.Errorf("parse start period: %w", err)
+			}
+			if d < 0 {
+				return HealthcheckConfig{}, fmt.Errorf("start period must not be negative, got %s", d)
+			}
+			hc.StartPeriod = d
+		case "--retries":
+			r, err := strconv.Atoi(val)
+			if err != nil {
+				return HealthcheckConfig{}, fmt.Errorf("parse retries: %w", err)
+			}
+			if r < 0 {
+				return HealthcheckConfig{}, fmt.Errorf("retries must not be negative, got %d", r)
+			}
+			hc.Retries = r
+		default:
+			return HealthcheckConfig{}, fmt.Errorf("unknown healthcheck flag: %q", token)
+		}
+
+		// Drop the flag just consumed; what remains starts at the next token.
+		rest = strings.TrimSpace(rest[len(token):])
+	}
+
+	if rest == "" {
+		return HealthcheckConfig{}, fmt.Errorf("healthcheck has no command")
+	}
+
+	hc.Cmd = rest
+	return hc, nil
+}
--- a/internal/recipe/healthcheck_test.go
+++ b/internal/recipe/healthcheck_test.go
@ -0,0 +1,126 @@
+package recipe
+
+import (
+	"testing"
+	"time"
+)
+
+func TestParseHealthcheck(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   string
+		want    HealthcheckConfig
+		wantErr bool
+	}{
+		{
+			name:  "plain command",
+			input: "curl -f http://localhost:8080",
+			want: HealthcheckConfig{
+				Cmd:      "curl -f http://localhost:8080",
+				Interval: 3 * time.Second,
+				Timeout:  10 * time.Second,
+			},
+			wantErr: false,
+		},
+		{
+			name:  "all flags",
+			input: "--interval=5s --timeout=2s --start-period=15s --retries=3 ping -c 1 8.8.8.8",
+			want: HealthcheckConfig{
+				Cmd:         "ping -c 1 8.8.8.8",
+				Interval:    5 * time.Second,
+				Timeout:     2 * time.Second,
+				StartPeriod: 15 * time.Second,
+				Retries:     3,
+			},
+			wantErr: false,
+		},
+		{
+			name:  "partial flags",
+			input: "--timeout=5s my-custom-check --verbose",
+			want: HealthcheckConfig{
+				Cmd:      "my-custom-check --verbose",
+				Interval: 3 * time.Second,
+				Timeout:  5 * time.Second,
+			},
+			wantErr: false,
+		},
+		{
+			name:  "retries only",
+			input: "--retries=5 test.sh",
+			want: HealthcheckConfig{
+				Cmd:      "test.sh",
+				Interval: 3 * time.Second,
+				Timeout:  10 * time.Second,
+				Retries:  5,
+			},
+			wantErr: false,
+		},
+		{
+			name:    "empty string",
+			input:   "",
+			wantErr: true,
+		},
+		{
+			name:    "whitespace only",
+			input:   "   \t  \n ",
+			wantErr: true,
+		},
+		{
+			name:    "flags but no command",
+			input:   "--interval=5s --retries=2",
+			wantErr: true,
+		},
+		{
+			name:    "unknown flag",
+			input:   "--magic=true my-check",
+			wantErr: true,
+		},
+		{
+			name:    "invalid duration",
+			input:   "--interval=5smiles check.sh",
+			wantErr: true,
+		},
+		{
+			name:    "invalid retries",
+			input:   "--retries=five check.sh",
+			wantErr: true,
+		},
+		{
+			name:  "command with dashes",
+			input: "--interval=2s command-with-dash --flag=value",
+			want: HealthcheckConfig{
+				Cmd:      "command-with-dash --flag=value",
+				Interval: 2 * time.Second,
+				Timeout:  10 * time.Second,
+			},
+			wantErr: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := ParseHealthcheck(tt.input)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("ParseHealthcheck() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !tt.wantErr {
+				if got.Cmd != tt.want.Cmd {
+					t.Errorf("Cmd got = %v, want %v", got.Cmd, tt.want.Cmd)
+				}
+				if got.Interval != tt.want.Interval {
+					t.Errorf("Interval got = %v, want %v", got.Interval, tt.want.Interval)
+				}
+				if got.Timeout != tt.want.Timeout {
+					t.Errorf("Timeout got = %v, want %v", got.Timeout, tt.want.Timeout)
+				}
+				if got.StartPeriod != tt.want.StartPeriod {
+					t.Errorf("StartPeriod got = %v, want %v", got.StartPeriod, tt.want.StartPeriod)
+				}
+				if got.Retries != tt.want.Retries {
+					t.Errorf("Retries got = %v, want %v", got.Retries, tt.want.Retries)
+				}
+			}
+		})
+	}
+}
--- a/internal/service/build.go
+++ b/internal/service/build.go
@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log/slog"
+	"strings"
 	"sync"
 	"time"

@ -23,8 +24,6 @@ import (
 const (
 	buildQueueKey       = "wrenn:build_queue"
 	buildCommandTimeout = 30 * time.Second
-	healthcheckInterval = 1 * time.Second
-	healthcheckTimeout  = 60 * time.Second
 )

 // preBuildCmds run before the user recipe to prepare the build environment.
@ -321,11 +320,18 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 		panic(fmt.Sprintf("invalid post-build recipe: %v", err))
 	}

-	// Execute build phases: pre-build → user recipe → post-build.
-	// bctx carries working directory and env vars across all phases.
 	var logs []recipe.BuildLogEntry
 	step := 0
-	bctx := &recipe.ExecContext{}
+
+	envVars, err := s.fetchSandboxEnv(buildCtx, agent, sandboxIDStr)
+	if err != nil {
+		log.Warn("failed to fetch sandbox env, using defaults", "error", err)
+		envVars = map[string]string{
+			"PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"HOME": "/root",
+		}
+	}
+	bctx := &recipe.ExecContext{EnvVars: envVars}

 	runPhase := func(phase string, steps []recipe.Step, defaultTimeout time.Duration) bool {
 		newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step, defaultTimeout, bctx, agent.Exec)
@ -365,8 +371,14 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	// Healthcheck or direct snapshot.
 	var sizeBytes int64
 	if build.Healthcheck != "" {
-		log.Info("running healthcheck", "cmd", build.Healthcheck)
-		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, build.Healthcheck); err != nil {
+		hc, err := recipe.ParseHealthcheck(build.Healthcheck)
+		if err != nil {
+			s.destroySandbox(buildCtx, agent, sandboxIDStr)
+			s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
+			return
+		}
+		log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
+		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc); err != nil {
 			s.destroySandbox(buildCtx, agent, sandboxIDStr)
 			if buildCtx.Err() != nil {
 				return
@ -445,36 +457,61 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
 	log.Info("template build completed successfully", "name", build.Name)
 }

-func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr, cmd string) error {
-	deadline := time.NewTimer(healthcheckTimeout)
+// waitForHealthcheck repeatedly executes the healthcheck command inside the
+// sandbox according to the config's interval, timeout, start-period, and
+// retries.
+//
+// Checks are driven by a ticker, so the first check runs one Interval after
+// the call. A check fails when the exec call errors or the command exits
+// non-zero. Failures observed before StartPeriod has elapsed are not counted
+// toward the retry budget. The overall deadline is
+// StartPeriod + (attempts+1)*Interval, where attempts is Retries, or 100 when
+// Retries is 0.
+//
+// Returns nil on the first successful check, or an error if the config has a
+// non-positive Interval or Timeout, retries are exhausted, the deadline
+// passes, or the context is cancelled.
+func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr string, hc recipe.HealthcheckConfig) error {
+	// time.NewTicker panics on a non-positive interval, and a non-positive
+	// timeout would cancel every check before it runs; reject both up front.
+	if hc.Interval <= 0 {
+		return fmt.Errorf("healthcheck interval must be positive, got %s", hc.Interval)
+	}
+	if hc.Timeout <= 0 {
+		return fmt.Errorf("healthcheck timeout must be positive, got %s", hc.Timeout)
+	}
+
+	maxAttempts := 100
+	if hc.Retries > 0 {
+		maxAttempts = hc.Retries
+	}
+	deadline := time.NewTimer(hc.StartPeriod + time.Duration(maxAttempts+1)*hc.Interval)
 	defer deadline.Stop()
-	ticker := time.NewTicker(healthcheckInterval)
+	ticker := time.NewTicker(hc.Interval)
 	defer ticker.Stop()

+	startedAt := time.Now()
+	failCount := 0
+	// The agent-side timeout is expressed in whole seconds; round up so a
+	// sub-second hc.Timeout does not truncate to 0.
+	timeoutSec := int32((hc.Timeout + time.Second - 1) / time.Second)
+
 	for {
 		select {
 		case <-ctx.Done():
 			return ctx.Err()
 		case <-deadline.C:
-			return fmt.Errorf("healthcheck timed out after %s", healthcheckTimeout)
+			return fmt.Errorf("healthcheck timed out after %s (%d failed checks counted)", time.Since(startedAt), failCount)
 		case <-ticker.C:
-			execCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
+			execCtx, cancel := context.WithTimeout(ctx, hc.Timeout)
 			resp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
 				SandboxId:  sandboxIDStr,
 				Cmd:        "/bin/sh",
-				Args:       []string{"-c", cmd},
-				TimeoutSec: 10,
+				Args:       []string{"-c", hc.Cmd},
+				TimeoutSec: timeoutSec,
 			}))
 			cancel()

 			if err != nil {
 				slog.Debug("healthcheck exec error (retrying)", "error", err)
+				// Only failures after the start period consume retries.
+				if time.Since(startedAt) >= hc.StartPeriod {
+					failCount++
+					if hc.Retries > 0 && failCount >= hc.Retries {
+						return fmt.Errorf("healthcheck failed after %d retries: exec error: %v", failCount, err)
+					}
+				}
 				continue
 			}
 			if resp.Msg.ExitCode == 0 {
 				return nil
 			}
 			slog.Debug("healthcheck failed (retrying)", "exit_code", resp.Msg.ExitCode)
+			// Only failures after the start period consume retries.
+			if time.Since(startedAt) >= hc.StartPeriod {
+				failCount++
+				if hc.Retries > 0 && failCount >= hc.Retries {
+					return fmt.Errorf("healthcheck failed after %d retries: exit code %d", failCount, resp.Msg.ExitCode)
+				}
+			}
 		}
 	}
 }
@ -517,3 +554,49 @@ func (s *BuildService) destroySandbox(_ context.Context, agent buildAgentClient,
 		slog.Warn("failed to destroy build sandbox", "sandbox_id", sandboxIDStr, "error", err)
 	}
 }
+
+// fetchSandboxEnv runs `env` through /bin/sh inside the given sandbox via the
+// build agent and returns the reported variables as a name-to-value map.
+// An error is returned when the exec call fails or the command exits with a
+// non-zero code.
+func (s *BuildService) fetchSandboxEnv(ctx context.Context,
+	agent buildAgentClient, sandboxIDStr string) (map[string]string, error) {
+	req := connect.NewRequest(&pb.ExecRequest{
+		SandboxId:  sandboxIDStr,
+		Cmd:        "/bin/sh",
+		Args:       []string{"-c", "env"},
+		TimeoutSec: 10,
+	})
+
+	resp, err := agent.Exec(ctx, req)
+	if err != nil {
+		return nil, fmt.Errorf("fetch env: %w", err)
+	}
+
+	if code := resp.Msg.ExitCode; code != 0 {
+		return nil, fmt.Errorf("fetch env: command exited with code %d", code)
+	}
+
+	stdout := string(resp.Msg.Stdout)
+	return s.parseSandboxEnv(stdout), nil
+}
+
+// parseSandboxEnv converts the raw newline-separated output of an 'env'
+// command into a map.
+//
+// Each line is split at the first '=', so values containing '=' are kept
+// intact. Values are stored verbatim (leading and trailing spaces are part of
+// the value); only a trailing carriage return is stripped. Lines without '='
+// and lines whose name is not a valid shell identifier
+// ([A-Za-z_][A-Za-z0-9_]*) are skipped. This drops empty names and the
+// indented or otherwise malformed continuation lines produced by multi-line
+// values. Skipped variables remain set in the sandbox itself; they are merely
+// not tracked for expansion.
+func (s *BuildService) parseSandboxEnv(raw string) map[string]string {
+	// validName reports whether name is a non-empty shell identifier.
+	validName := func(name string) bool {
+		if name == "" {
+			return false
+		}
+		for i := 0; i < len(name); i++ {
+			c := name[i]
+			switch {
+			case c == '_', 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
+			case '0' <= c && c <= '9' && i > 0:
+			default:
+				return false
+			}
+		}
+		return true
+	}
+
+	envVars := make(map[string]string)
+
+	for line := range strings.SplitSeq(raw, "\n") {
+		// Tolerate CRLF output without touching meaningful whitespace.
+		line = strings.TrimSuffix(line, "\r")
+
+		name, value, found := strings.Cut(line, "=")
+		if !found || !validName(name) {
+			continue
+		}
+
+		envVars[name] = value
+	}
+
+	return envVars
+}