package service

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"strings"
	"sync"
	"time"

	"connectrpc.com/connect"
	"github.com/jackc/pgx/v5/pgtype"
	"github.com/redis/go-redis/v9"

	"git.omukk.dev/wrenn/wrenn/internal/db"
	"git.omukk.dev/wrenn/wrenn/internal/id"
	"git.omukk.dev/wrenn/wrenn/internal/lifecycle"
	"git.omukk.dev/wrenn/wrenn/internal/recipe"
	"git.omukk.dev/wrenn/wrenn/internal/scheduler"
	pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
)

const (
	buildQueueKey       = "wrenn:build_queue"
	buildCommandTimeout = 30 * time.Second
)

// preBuildCmds run before the user recipe to prepare the build environment.
// apt update runs as root first, then USER switches to wrenn-user for the recipe.
var preBuildCmds = []string{
	"RUN apt update",
	"USER wrenn-user",
	"WORKDIR /home/wrenn-user",
}

// postBuildCmds run after the user recipe to clean up caches and reduce image size.
var postBuildCmds = []string{
	"RUN apt clean",
	"RUN apt autoremove -y",
	"RUN rm -rf /var/lib/apt/lists/*",
	"RUN rm -rf /tmp/build-files /tmp/build-files.*",
}

// buildAgentClient is the subset of the host agent client used by the build worker.
type buildAgentClient interface {
	CreateSandbox(ctx context.Context, req *connect.Request[pb.CreateSandboxRequest]) (*connect.Response[pb.CreateSandboxResponse], error)
	DestroySandbox(ctx context.Context, req *connect.Request[pb.DestroySandboxRequest]) (*connect.Response[pb.DestroySandboxResponse], error)
	Exec(ctx context.Context, req *connect.Request[pb.ExecRequest]) (*connect.Response[pb.ExecResponse], error)
	WriteFile(ctx context.Context, req *connect.Request[pb.WriteFileRequest]) (*connect.Response[pb.WriteFileResponse], error)
	CreateSnapshot(ctx context.Context, req *connect.Request[pb.CreateSnapshotRequest]) (*connect.Response[pb.CreateSnapshotResponse], error)
	FlattenRootfs(ctx context.Context, req *connect.Request[pb.FlattenRootfsRequest]) (*connect.Response[pb.FlattenRootfsResponse], error)
}

// BuildService handles template build orchestration.
type BuildService struct {
	DB        *db.Queries
	Redis     *redis.Client
	Pool      *lifecycle.HostClientPool
	Scheduler scheduler.HostScheduler

	mu        sync.Mutex
	cancelMap map[string]context.CancelFunc // buildID → per-build cancel func
	filesMap  map[string][]byte             // buildID → uploaded archive bytes
}

// BuildCreateParams holds the parameters for creating a template build.
type BuildCreateParams struct {
	Name         string
	BaseTemplate string
	Recipe       []string
	Healthcheck  string
	VCPUs        int32
	MemoryMB     int32
	SkipPrePost  bool
	Archive      []byte // Optional tar/tar.gz/zip archive for COPY commands.
	ArchiveName  string // Original filename (used to detect format).
}

// storeArchive stores uploaded archive bytes keyed by build ID for the worker.
func (s *BuildService) storeArchive(buildID string, data []byte) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.filesMap == nil {
		s.filesMap = make(map[string][]byte)
	}
	s.filesMap[buildID] = data
}

// takeArchive retrieves and removes stored archive bytes for a build.
func (s *BuildService) takeArchive(buildID string) []byte {
	s.mu.Lock()
	defer s.mu.Unlock()
	data := s.filesMap[buildID]
	delete(s.filesMap, buildID)
	return data
}

// Create inserts a new build record and enqueues it to Redis.
func (s *BuildService) Create(ctx context.Context, p BuildCreateParams) (db.TemplateBuild, error) {
	if p.BaseTemplate == "" {
		p.BaseTemplate = "minimal"
	}
	if p.VCPUs <= 0 {
		p.VCPUs = 1
	}
	if p.MemoryMB <= 0 {
		p.MemoryMB = 512
	}

	recipeJSON, err := json.Marshal(p.Recipe)
	if err != nil {
		return db.TemplateBuild{}, fmt.Errorf("marshal recipe: %w", err)
	}

	buildID := id.NewBuildID()
	buildIDStr := id.FormatBuildID(buildID)
	newTemplateID := id.NewTemplateID()

	defaultSteps := len(preBuildCmds) + len(postBuildCmds)
	if p.SkipPrePost {
		defaultSteps = 0
	}

	build, err := s.DB.InsertTemplateBuild(ctx, db.InsertTemplateBuildParams{
		ID:           buildID,
		Name:         p.Name,
		BaseTemplate: p.BaseTemplate,
		Recipe:       recipeJSON,
		Healthcheck:  p.Healthcheck,
		Vcpus:        p.VCPUs,
		MemoryMb:     p.MemoryMB,
		TotalSteps:   int32(len(p.Recipe) + defaultSteps),
		TemplateID:   newTemplateID,
		TeamID:       id.PlatformTeamID,
		SkipPrePost:  p.SkipPrePost,
	})
	if err != nil {
		return db.TemplateBuild{}, fmt.Errorf("insert build: %w", err)
	}

	// Enqueue build ID (as formatted string) to Redis for workers to pick up.
	if err := s.Redis.RPush(ctx, buildQueueKey, buildIDStr).Err(); err != nil {
		return db.TemplateBuild{}, fmt.Errorf("enqueue build: %w", err)
	}

	// Store archive for the worker if provided.
	if len(p.Archive) > 0 {
		s.storeArchive(buildIDStr, p.Archive)
	}

	return build, nil
}

// Get returns a single build by ID.
func (s *BuildService) Get(ctx context.Context, buildID pgtype.UUID) (db.TemplateBuild, error) {
	return s.DB.GetTemplateBuild(ctx, buildID)
}

// List returns all builds ordered by creation time.
func (s *BuildService) List(ctx context.Context) ([]db.TemplateBuild, error) {
	return s.DB.ListTemplateBuilds(ctx)
}

// Cancel cancels a pending or running build. For pending builds the status is
// updated in the DB and the worker skips it when dequeued. For running builds
// the per-build context is cancelled, which causes the current exec step to
// abort; executeBuild then detects the cancellation and records the status.
func (s *BuildService) Cancel(ctx context.Context, buildID pgtype.UUID) error {
	build, err := s.DB.GetTemplateBuild(ctx, buildID)
	if err != nil {
		return fmt.Errorf("get build: %w", err)
	}
	switch build.Status {
	case "success", "failed", "cancelled":
		return fmt.Errorf("build is already %s", build.Status)
	}

	// Mark cancelled in DB first. This handles both pending builds (which haven't
	// been picked up yet) and acts as a flag for executeBuild to check on start.
	if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
		ID: buildID, Status: "cancelled",
	}); err != nil {
		return fmt.Errorf("update build status: %w", err)
	}

	// If the build is currently running, signal its context.
	buildIDStr := id.FormatBuildID(buildID)
	s.mu.Lock()
	cancel, running := s.cancelMap[buildIDStr]
	s.mu.Unlock()
	if running {
		cancel()
	}

	return nil
}

// StartWorkers launches n goroutines that consume from the Redis build queue.
// The returned cancel function stops all workers.
func (s *BuildService) StartWorkers(ctx context.Context, n int) context.CancelFunc {
	ctx, cancel := context.WithCancel(ctx)
	for i := range n {
		go s.worker(ctx, i)
	}
	slog.Info("build workers started", "count", n)
	return cancel
}

func (s *BuildService) worker(ctx context.Context, workerID int) {
	log := slog.With("worker", workerID)
	for {
		// BLPOP blocks until a build ID is available or context is cancelled.
		result, err := s.Redis.BLPop(ctx, 0, buildQueueKey).Result()
		if err != nil {
			if ctx.Err() != nil {
				log.Info("build worker shutting down")
				return
			}
			log.Error("redis BLPOP error", "error", err)
			time.Sleep(time.Second)
			continue
		}
		// result[0] is the key, result[1] is the build ID (formatted string).
		buildIDStr := result[1]
		log.Info("picked up build", "build_id", buildIDStr)
		s.executeBuild(ctx, buildIDStr)
	}
}

func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
	log := slog.With("build_id", buildIDStr)

	buildID, err := id.ParseBuildID(buildIDStr)
	if err != nil {
		log.Error("invalid build ID from queue", "error", err)
		return
	}

	// Create a per-build context so this build can be cancelled independently of
	// the worker. Register in cancelMap before fetching the build so that a
	// concurrent Cancel call can always find and signal it.
	buildCtx, buildCancel := context.WithCancel(ctx)
	defer buildCancel()

	s.mu.Lock()
	if s.cancelMap == nil {
		s.cancelMap = make(map[string]context.CancelFunc)
	}
	s.cancelMap[buildIDStr] = buildCancel
	s.mu.Unlock()
	defer func() {
		s.mu.Lock()
		delete(s.cancelMap, buildIDStr)
		s.mu.Unlock()
	}()

	build, err := s.DB.GetTemplateBuild(buildCtx, buildID)
	if err != nil {
		log.Error("failed to fetch build", "error", err)
		return
	}

	// Skip if already cancelled (Cancel was called before we dequeued).
	if build.Status == "cancelled" {
		log.Info("build already cancelled, skipping")
		return
	}

	// Mark as running.
	if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
		ID: buildID, Status: "running",
	}); err != nil {
		log.Error("failed to update build status", "error", err)
		return
	}

	// Parse user recipe.
	var userRecipe []string
	if err := json.Unmarshal(build.Recipe, &userRecipe); err != nil {
		s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid recipe JSON: %v", err))
		return
	}

	// Pick a platform host and create a sandbox.
	host, err := s.Scheduler.SelectHost(buildCtx, id.PlatformTeamID, false, build.MemoryMb, 5120)
	if err != nil {
		s.failBuild(buildCtx, buildID, fmt.Sprintf("no host available: %v", err))
		return
	}

	agent, err := s.Pool.GetForHost(host)
	if err != nil {
		s.failBuild(buildCtx, buildID, fmt.Sprintf("agent client error: %v", err))
		return
	}

	sandboxID := id.NewSandboxID()
	sandboxIDStr := id.FormatSandboxID(sandboxID)
	log = log.With("sandbox_id", sandboxIDStr, "host_id", id.FormatHostID(host.ID))

	// Resolve the base template to UUIDs. "minimal" is the zero sentinel.
	baseTeamID := id.PlatformTeamID
	baseTemplateID := id.MinimalTemplateID
	if build.BaseTemplate != "minimal" {
		baseTmpl, err := s.DB.GetPlatformTemplateByName(buildCtx, build.BaseTemplate)
		if err != nil {
			s.failBuild(buildCtx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
			return
		}
		baseTeamID = baseTmpl.TeamID
		baseTemplateID = baseTmpl.ID
	}

	resp, err := agent.CreateSandbox(buildCtx, connect.NewRequest(&pb.CreateSandboxRequest{
		SandboxId:  sandboxIDStr,
		Template:   build.BaseTemplate,
		TeamId:     id.UUIDString(baseTeamID),
		TemplateId: id.UUIDString(baseTemplateID),
		Vcpus:      build.Vcpus,
		MemoryMb:   build.MemoryMb,
		TimeoutSec: 0,    // no auto-pause for builds
		DiskSizeMb: 5120, // 5 GB for template builds
	}))
	if err != nil {
		s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
		return
	}
	_ = resp

	// Record sandbox/host association.
	_ = s.DB.UpdateBuildSandbox(buildCtx, db.UpdateBuildSandboxParams{
		ID:        buildID,
		SandboxID: sandboxID,
		HostID:    host.ID,
	})

	// Upload and extract build archive if provided.
	archive := s.takeArchive(buildIDStr)
	if len(archive) > 0 {
		if err := s.uploadAndExtractArchive(buildCtx, agent, sandboxIDStr, archive, buildIDStr); err != nil {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			s.failBuild(buildCtx, buildID, fmt.Sprintf("archive upload failed: %v", err))
			return
		}
	}

	// Parse recipe steps. preBuildCmds and postBuildCmds are hardcoded and always
	// valid; panic on error is appropriate here since it would be a programmer mistake.
	preBuildSteps, err := recipe.ParseRecipe(preBuildCmds)
	if err != nil {
		panic(fmt.Sprintf("invalid pre-build recipe: %v", err))
	}
	userRecipeSteps, err := recipe.ParseRecipe(userRecipe)
	if err != nil {
		s.destroySandbox(buildCtx, agent, sandboxIDStr)
		s.failBuild(buildCtx, buildID, fmt.Sprintf("recipe parse error: %v", err))
		return
	}
	postBuildSteps, err := recipe.ParseRecipe(postBuildCmds)
	if err != nil {
		panic(fmt.Sprintf("invalid post-build recipe: %v", err))
	}

	var logs []recipe.BuildLogEntry
	step := 0

	envVars, err := s.fetchSandboxEnv(buildCtx, agent, sandboxIDStr)
	if err != nil {
		log.Warn("failed to fetch sandbox env, using defaults", "error", err)
		envVars = map[string]string{
			"PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"HOME": "/root",
		}
	}
	bctx := &recipe.ExecContext{EnvVars: envVars, User: "root"}

	// Per-step progress callback for live UI updates.
	progressFn := func(currentStep int, allEntries []recipe.BuildLogEntry) {
		s.updateLogs(buildCtx, buildID, currentStep, allEntries)
	}

	runPhase := func(phase string, steps []recipe.Step, defaultTimeout time.Duration) bool {
		newEntries, nextStep, ok := recipe.Execute(buildCtx, phase, steps, sandboxIDStr, step, defaultTimeout, bctx, agent.Exec, func(currentStep int, phaseEntries []recipe.BuildLogEntry) {
			// Progress callback: combine prior logs with current phase entries.
			progressFn(currentStep, append(logs, phaseEntries...))
		})
		logs = append(logs, newEntries...)
		step = nextStep
		s.updateLogs(buildCtx, buildID, step, logs)
		if !ok {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			// If the build was cancelled, status is already set — don't overwrite with "failed".
			if buildCtx.Err() != nil {
				return false
			}
			reason := "unknown error"
			if len(newEntries) > 0 {
				last := newEntries[len(newEntries)-1]
				reason = last.Stderr
				if reason == "" {
					reason = fmt.Sprintf("exit code %d", last.Exit)
				}
			}
			s.failBuild(buildCtx, buildID, fmt.Sprintf("%s step %d failed: %s", phase, step, reason))
		}
		return ok
	}

	// Phase 1: Pre-build (as root) — creates wrenn-user, updates apt.
	if !build.SkipPrePost {
		if !runPhase("pre-build", preBuildSteps, 0) {
			return
		}
	}

	// Phase 2: User recipe — starts as wrenn-user (set by USER in pre-build)
	// or root if skip_pre_post.
	if !runPhase("recipe", userRecipeSteps, buildCommandTimeout) {
		return
	}

	// Capture the final user and env vars as template defaults.
	// Filter out user-specific and runtime vars that should be resolved at
	// sandbox creation time, not baked in from the build environment.
	templateDefaultUser := bctx.User
	templateDefaultEnv := filterBuildEnv(bctx.EnvVars)

	// Phase 3: Post-build (as root) — cleanup.
	bctx.User = "root"
	if !build.SkipPrePost {
		if !runPhase("post-build", postBuildSteps, 0) {
			return
		}
	}

	// Healthcheck or direct snapshot.
	var sizeBytes int64
	if build.Healthcheck != "" {
		hc, err := recipe.ParseHealthcheck(build.Healthcheck)
		if err != nil {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
			return
		}
		log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
		if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc); err != nil {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			if buildCtx.Err() != nil {
				return
			}
			s.failBuild(buildCtx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
			return
		}

		// Healthcheck passed → full snapshot (with memory/CPU state).
		log.Info("healthcheck passed, creating snapshot")
		snapResp, err := agent.CreateSnapshot(buildCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
			SandboxId:  sandboxIDStr,
			Name:       build.Name,
			TeamId:     id.UUIDString(build.TeamID),
			TemplateId: id.UUIDString(build.TemplateID),
		}))
		if err != nil {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			if buildCtx.Err() != nil {
				return
			}
			s.failBuild(buildCtx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
			return
		}
		sizeBytes = snapResp.Msg.SizeBytes
	} else {
		// No healthcheck → image-only template (rootfs only).
		log.Info("no healthcheck, flattening rootfs")
		flatResp, err := agent.FlattenRootfs(buildCtx, connect.NewRequest(&pb.FlattenRootfsRequest{
			SandboxId:  sandboxIDStr,
			Name:       build.Name,
			TeamId:     id.UUIDString(build.TeamID),
			TemplateId: id.UUIDString(build.TemplateID),
		}))
		if err != nil {
			s.destroySandbox(buildCtx, agent, sandboxIDStr)
			if buildCtx.Err() != nil {
				return
			}
			s.failBuild(buildCtx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
			return
		}
		sizeBytes = flatResp.Msg.SizeBytes
	}

	// Insert into templates table as a global (platform) template.
	templateType := "base"
	if build.Healthcheck != "" {
		templateType = "snapshot"
	}

	// Serialize env vars for DB storage.
	defaultEnvJSON, err := json.Marshal(templateDefaultEnv)
	if err != nil {
		defaultEnvJSON = []byte("{}")
	}

	if _, err := s.DB.InsertTemplate(buildCtx, db.InsertTemplateParams{
		ID:          build.TemplateID,
		Name:        build.Name,
		Type:        templateType,
		Vcpus:       build.Vcpus,
		MemoryMb:    build.MemoryMb,
		SizeBytes:   sizeBytes,
		TeamID:      id.PlatformTeamID,
		DefaultUser: templateDefaultUser,
		DefaultEnv:  defaultEnvJSON,
	}); err != nil {
		log.Error("failed to insert template record", "error", err)
		// Build succeeded on disk, just DB record failed — don't mark as failed.
	}

	// Record defaults on the build record for inspection.
	_ = s.DB.UpdateBuildDefaults(buildCtx, db.UpdateBuildDefaultsParams{
		ID:          buildID,
		DefaultUser: templateDefaultUser,
		DefaultEnv:  defaultEnvJSON,
	})

	// For CreateSnapshot, the sandbox is already destroyed by the snapshot process.
	// For FlattenRootfs, the sandbox is already destroyed by the flatten process.
	// No additional destroy needed.

	// Mark build as success.
	if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
		ID: buildID, Status: "success",
	}); err != nil {
		log.Error("failed to mark build as success", "error", err)
	}

	log.Info("template build completed successfully", "name", build.Name)
}

// waitForHealthcheck repeatedly executes the healthcheck command inside the
// sandbox according to the config's interval, timeout, start-period, and
// retries.
// During the start period, failures are not counted toward the retry budget.
// Returns nil on the first successful check, or an error if retries are
// exhausted, the deadline passes, or the context is cancelled.
func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr string, hc recipe.HealthcheckConfig) error {
	ticker := time.NewTicker(hc.Interval)
	defer ticker.Stop()

	// When retries > 0, set a deadline based on the retry budget.
	// When retries == 0 (unlimited), rely solely on the parent context deadline.
	var deadlineCh <-chan time.Time
	if hc.Retries > 0 {
		deadline := time.NewTimer(hc.StartPeriod + time.Duration(hc.Retries+1)*hc.Interval)
		defer deadline.Stop()
		deadlineCh = deadline.C
	}

	startedAt := time.Now()
	failCount := 0

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-deadlineCh:
			return fmt.Errorf("healthcheck timed out: exceeded %d attempts over %s", failCount, time.Since(startedAt))
		case <-ticker.C:
			execCtx, cancel := context.WithTimeout(ctx, hc.Timeout)
			resp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{
				SandboxId:  sandboxIDStr,
				Cmd:        "/bin/sh",
				Args:       []string{"-c", hc.Cmd},
				TimeoutSec: int32(hc.Timeout.Seconds()),
			}))
			cancel()

			if err != nil {
				slog.Debug("healthcheck exec error (retrying)", "error", err)
				if time.Since(startedAt) >= hc.StartPeriod {
					failCount++
					if hc.Retries > 0 && failCount >= hc.Retries {
						return fmt.Errorf("healthcheck failed after %d retries: exec error: %w", failCount, err)
					}
				}
				continue
			}
			if resp.Msg.ExitCode == 0 {
				return nil
			}
			slog.Debug("healthcheck failed (retrying)", "exit_code", resp.Msg.ExitCode)
			if time.Since(startedAt) >= hc.StartPeriod {
				failCount++
				if hc.Retries > 0 && failCount >= hc.Retries {
					return fmt.Errorf("healthcheck failed after %d retries: exit code %d", failCount, resp.Msg.ExitCode)
				}
			}
		}
	}
}

func (s *BuildService) updateLogs(ctx context.Context, buildID pgtype.UUID, step int, logs []recipe.BuildLogEntry) {
	logsJSON, err := json.Marshal(logs)
	if err != nil {
		slog.Warn("failed to marshal build logs", "error", err)
		return
	}
	if err := s.DB.UpdateBuildProgress(ctx, db.UpdateBuildProgressParams{
		ID:          buildID,
		CurrentStep: int32(step),
		Logs:        logsJSON,
	}); err != nil {
		slog.Warn("failed to update build progress", "error", err)
	}
}

func (s *BuildService) failBuild(_ context.Context, buildID pgtype.UUID, errMsg string) {
	slog.Error("build failed", "build_id", id.FormatBuildID(buildID), "error", errMsg)
	// Use a detached context so DB writes survive parent context cancellation (e.g. shutdown).
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := s.DB.UpdateBuildError(ctx, db.UpdateBuildErrorParams{
		ID:    buildID,
		Error: errMsg,
	}); err != nil {
		slog.Error("failed to update build error", "build_id", id.FormatBuildID(buildID), "error", err)
	}
}

func (s *BuildService) destroySandbox(_ context.Context, agent buildAgentClient, sandboxIDStr string) {
	// Use a detached context so cleanup succeeds even during shutdown.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	if _, err := agent.DestroySandbox(ctx, connect.NewRequest(&pb.DestroySandboxRequest{
		SandboxId: sandboxIDStr,
	})); err != nil {
		slog.Warn("failed to destroy build sandbox", "sandbox_id", sandboxIDStr, "error", err)
	}
}

// fetchSandboxEnv executes the 'env' command inside the specified sandbox via
// the build agent and returns environment variables
func (s *BuildService) fetchSandboxEnv(ctx context.Context,
	agent buildAgentClient, sandboxIDStr string) (map[string]string, error) {
	resp, err := agent.Exec(ctx, connect.NewRequest(&pb.ExecRequest{
		SandboxId:  sandboxIDStr,
		Cmd:        "/bin/sh",
		Args:       []string{"-c", "env"},
		TimeoutSec: 10,
	}))
	if err != nil {
		return nil, fmt.Errorf("fetch env: %w", err)
	}

	if resp.Msg.ExitCode != 0 {
		return nil, fmt.Errorf("fetch env: command exited with code %d",
			resp.Msg.ExitCode)
	}

	return parseSandboxEnv(string(resp.Msg.Stdout)), nil
}

// parseSandboxEnv converts the raw newline-separated output of an 'env'
// command into a map.
// It skips empty lines and malformed entries, and correctly handles values
// containing '='.
func parseSandboxEnv(raw string) map[string]string {
	envVars := make(map[string]string)

	for line := range strings.SplitSeq(raw, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, "=", 2)
		if len(parts) != 2 {
			continue
		}

		envVars[parts[0]] = parts[1]
	}

	return envVars
}

// uploadAndExtractArchive writes the archive to the sandbox and extracts it
// to /tmp/build-files/. Detects format from content (tar.gz, tar, zip).
func (s *BuildService) uploadAndExtractArchive(
	ctx context.Context,
	agent buildAgentClient,
	sandboxID string,
	archive []byte,
	buildID string,
) error {
	// Detect archive type from magic bytes.
	var archivePath, extractCmd string
	switch {
	case len(archive) >= 2 && archive[0] == 0x1f && archive[1] == 0x8b:
		// gzip (tar.gz)
		archivePath = "/tmp/build-files.tar.gz"
		extractCmd = "mkdir -p /tmp/build-files && tar xzf /tmp/build-files.tar.gz -C /tmp/build-files"
	case len(archive) >= 4 && string(archive[:4]) == "PK\x03\x04":
		// zip
		archivePath = "/tmp/build-files.zip"
		extractCmd = "mkdir -p /tmp/build-files && unzip -o /tmp/build-files.zip -d /tmp/build-files"
	case len(archive) >= 262 && string(archive[257:262]) == "ustar":
		// tar (ustar magic at offset 257)
		archivePath = "/tmp/build-files.tar"
		extractCmd = "mkdir -p /tmp/build-files && tar xf /tmp/build-files.tar -C /tmp/build-files"
	default:
		// Fallback: try tar.gz
		archivePath = "/tmp/build-files.tar.gz"
		extractCmd = "mkdir -p /tmp/build-files && tar xzf /tmp/build-files.tar.gz -C /tmp/build-files"
	}

	slog.Info("uploading build archive", "build_id", buildID, "path", archivePath, "size", len(archive))

	// Write archive to VM.
	if _, err := agent.WriteFile(ctx, connect.NewRequest(&pb.WriteFileRequest{
		SandboxId: sandboxID,
		Path:      archivePath,
		Content:   archive,
	})); err != nil {
		return fmt.Errorf("write archive: %w", err)
	}

	// Extract and ensure files are readable.
	fullCmd := extractCmd + " && chmod -R a+rX /tmp/build-files"

	resp, err := agent.Exec(ctx, connect.NewRequest(&pb.ExecRequest{
		SandboxId:  sandboxID,
		Cmd:        "/bin/sh",
		Args:       []string{"-c", fullCmd},
		TimeoutSec: 120,
	}))
	if err != nil {
		return fmt.Errorf("extract archive: %w", err)
	}
	if resp.Msg.ExitCode != 0 {
		return fmt.Errorf("extract archive: exit code %d: %s", resp.Msg.ExitCode, string(resp.Msg.Stderr))
	}

	return nil
}

// runtimeEnvVars lists env vars that are user- or session-specific and should
// not be persisted into template defaults. These are resolved at runtime by
// envd based on the actual user and sandbox context.
var runtimeEnvVars = map[string]bool{
	"HOME": true, "USER": true, "LOGNAME": true, "SHELL": true,
	"PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true,
	"SHLVL": true, "_": true,
	// Per-sandbox identifiers set by envd at boot via MMDS.
	"WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true,
}

// filterBuildEnv returns a copy of envVars with runtime/user-specific
// variables removed so they don't override envd's per-user resolution.
func filterBuildEnv(envVars map[string]string) map[string]string {
	filtered := make(map[string]string, len(envVars))
	for k, v := range envVars {
		if runtimeEnvVars[k] {
			continue
		}
		filtered[k] = v
	}
	return filtered
}