forked from wrenn/wrenn
refactor: polish control plane and host agent code
- Decompose executeBuild (318 lines) into provisionBuildSandbox and finalizeBuild helpers for readability
- Extract cleanupPauseFailure in sandbox manager to unify 3 inconsistent inline teardown paths (also fixes CoW file leak on rename failure)
- Remove unused ctx parameter from startProcess/startProcessForRestore
- Add missing MASQUERADE rollback entry in CreateNetwork for symmetry
- Consolidate duplicate writeJSON for UTF-8/base64 exec response
This commit is contained in:
@ -130,30 +130,22 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
updateLastActive(h.db, sandboxID, sandboxIDStr)
|
updateLastActive(h.db, sandboxID, sandboxIDStr)
|
||||||
|
|
||||||
// Use base64 encoding if output contains non-UTF-8 bytes.
|
|
||||||
stdout := resp.Msg.Stdout
|
stdout := resp.Msg.Stdout
|
||||||
stderr := resp.Msg.Stderr
|
stderr := resp.Msg.Stderr
|
||||||
encoding := "utf-8"
|
|
||||||
|
|
||||||
|
encoding := "utf-8"
|
||||||
|
stdoutStr, stderrStr := string(stdout), string(stderr)
|
||||||
if !utf8.Valid(stdout) || !utf8.Valid(stderr) {
|
if !utf8.Valid(stdout) || !utf8.Valid(stderr) {
|
||||||
encoding = "base64"
|
encoding = "base64"
|
||||||
writeJSON(w, http.StatusOK, execResponse{
|
stdoutStr = base64.StdEncoding.EncodeToString(stdout)
|
||||||
SandboxID: sandboxIDStr,
|
stderrStr = base64.StdEncoding.EncodeToString(stderr)
|
||||||
Cmd: req.Cmd,
|
|
||||||
Stdout: base64.StdEncoding.EncodeToString(stdout),
|
|
||||||
Stderr: base64.StdEncoding.EncodeToString(stderr),
|
|
||||||
ExitCode: resp.Msg.ExitCode,
|
|
||||||
DurationMs: duration.Milliseconds(),
|
|
||||||
Encoding: encoding,
|
|
||||||
})
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
writeJSON(w, http.StatusOK, execResponse{
|
writeJSON(w, http.StatusOK, execResponse{
|
||||||
SandboxID: sandboxIDStr,
|
SandboxID: sandboxIDStr,
|
||||||
Cmd: req.Cmd,
|
Cmd: req.Cmd,
|
||||||
Stdout: string(stdout),
|
Stdout: stdoutStr,
|
||||||
Stderr: string(stderr),
|
Stderr: stderrStr,
|
||||||
ExitCode: resp.Msg.ExitCode,
|
ExitCode: resp.Msg.ExitCode,
|
||||||
DurationMs: duration.Milliseconds(),
|
DurationMs: duration.Milliseconds(),
|
||||||
Encoding: encoding,
|
Encoding: encoding,
|
||||||
|
|||||||
@ -430,6 +430,9 @@ func CreateNetwork(slot *Slot) error {
|
|||||||
rollback()
|
rollback()
|
||||||
return fmt.Errorf("add masquerade rule: %w", err)
|
return fmt.Errorf("add masquerade rule: %w", err)
|
||||||
}
|
}
|
||||||
|
rollbacks = append(rollbacks, func() {
|
||||||
|
_ = iptablesHost("-t", "nat", "-D", "POSTROUTING", "-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()), "-o", defaultIface, "-j", "MASQUERADE")
|
||||||
|
})
|
||||||
|
|
||||||
slog.Info("network created",
|
slog.Info("network created",
|
||||||
"ns", slot.NamespaceID,
|
"ns", slot.NamespaceID,
|
||||||
|
|||||||
@ -359,6 +359,25 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cleanupPauseFailure is best-effort teardown when a pause operation fails
|
||||||
|
// after the VM has already been destroyed. It releases all resources and removes
|
||||||
|
// the sandbox from the in-memory map.
|
||||||
|
func (m *Manager) cleanupPauseFailure(sb *sandboxState, sandboxID string, pauseDir string) {
|
||||||
|
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||||
|
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||||
|
m.slots.Release(sb.SlotIndex)
|
||||||
|
if sb.dmDevice != nil {
|
||||||
|
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
|
||||||
|
os.Remove(sb.dmDevice.CowPath)
|
||||||
|
}
|
||||||
|
if sb.baseImagePath != "" {
|
||||||
|
m.loops.Release(sb.baseImagePath)
|
||||||
|
}
|
||||||
|
m.mu.Lock()
|
||||||
|
delete(m.boxes, sandboxID)
|
||||||
|
m.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
// Pause takes a snapshot of a running sandbox, then destroys all resources.
|
// Pause takes a snapshot of a running sandbox, then destroys all resources.
|
||||||
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
|
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
|
||||||
// After this call, the sandbox is no longer running but can be resumed.
|
// After this call, the sandbox is no longer running but can be resumed.
|
||||||
@ -513,45 +532,21 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
|
slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
|
||||||
}
|
}
|
||||||
if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
|
if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
|
||||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||||
m.slots.Release(sb.SlotIndex)
|
|
||||||
if sb.dmDevice != nil {
|
|
||||||
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
|
|
||||||
os.Remove(sb.dmDevice.CowPath)
|
|
||||||
}
|
|
||||||
if sb.baseImagePath != "" {
|
|
||||||
m.loops.Release(sb.baseImagePath)
|
|
||||||
}
|
|
||||||
m.mu.Lock()
|
|
||||||
delete(m.boxes, sandboxID)
|
|
||||||
m.mu.Unlock()
|
|
||||||
return fmt.Errorf("rename snapshot dir: %w", err)
|
return fmt.Errorf("rename snapshot dir: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
|
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
|
||||||
if sb.dmDevice != nil {
|
if sb.dmDevice != nil {
|
||||||
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
|
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
|
||||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||||
m.slots.Release(sb.SlotIndex)
|
|
||||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
|
||||||
m.mu.Lock()
|
|
||||||
delete(m.boxes, sandboxID)
|
|
||||||
m.mu.Unlock()
|
|
||||||
return fmt.Errorf("remove dm-snapshot: %w", err)
|
return fmt.Errorf("remove dm-snapshot: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
snapshotCow := snapshot.CowPath(pauseDir, "")
|
snapshotCow := snapshot.CowPath(pauseDir, "")
|
||||||
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
|
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
|
||||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
|
||||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
|
||||||
m.slots.Release(sb.SlotIndex)
|
|
||||||
os.Remove(sb.dmDevice.CowPath)
|
os.Remove(sb.dmDevice.CowPath)
|
||||||
if sb.baseImagePath != "" {
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||||
m.loops.Release(sb.baseImagePath)
|
|
||||||
}
|
|
||||||
m.mu.Lock()
|
|
||||||
delete(m.boxes, sandboxID)
|
|
||||||
m.mu.Unlock()
|
|
||||||
return fmt.Errorf("move cow file: %w", err)
|
return fmt.Errorf("move cow file: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -561,15 +556,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
VCPUs: sb.VCPUs,
|
VCPUs: sb.VCPUs,
|
||||||
MemoryMB: sb.MemoryMB,
|
MemoryMB: sb.MemoryMB,
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
|
||||||
m.slots.Release(sb.SlotIndex)
|
|
||||||
if sb.baseImagePath != "" {
|
|
||||||
m.loops.Release(sb.baseImagePath)
|
|
||||||
}
|
|
||||||
m.mu.Lock()
|
|
||||||
delete(m.boxes, sandboxID)
|
|
||||||
m.mu.Unlock()
|
|
||||||
return fmt.Errorf("write rootfs meta: %w", err)
|
return fmt.Errorf("write rootfs meta: %w", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -47,7 +47,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Step 1: Launch the Cloud Hypervisor process.
|
// Step 1: Launch the Cloud Hypervisor process.
|
||||||
proc, err := startProcess(ctx, &cfg)
|
proc, err := startProcess(&cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("start process: %w", err)
|
return nil, fmt.Errorf("start process: %w", err)
|
||||||
}
|
}
|
||||||
@ -220,7 +220,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshot
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Step 1: Launch bare CH process (no --restore).
|
// Step 1: Launch bare CH process (no --restore).
|
||||||
proc, err := startProcessForRestore(ctx, &cfg)
|
proc, err := startProcessForRestore(&cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("start process: %w", err)
|
return nil, fmt.Errorf("start process: %w", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -30,7 +30,7 @@ type process struct {
|
|||||||
// 4. symlink kernel and rootfs into SandboxDir
|
// 4. symlink kernel and rootfs into SandboxDir
|
||||||
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
|
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
|
||||||
// 6. exec cloud-hypervisor with the API socket path
|
// 6. exec cloud-hypervisor with the API socket path
|
||||||
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
func startProcess(cfg *VMConfig) (*process, error) {
|
||||||
script := buildStartScript(cfg)
|
script := buildStartScript(cfg)
|
||||||
return launchScript(script, cfg)
|
return launchScript(script, cfg)
|
||||||
}
|
}
|
||||||
@ -38,7 +38,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
|||||||
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
|
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
|
||||||
// The restore is performed via the API after the socket is ready, which allows
|
// The restore is performed via the API after the socket is ready, which allows
|
||||||
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
|
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||||
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
|
func startProcessForRestore(cfg *VMConfig) (*process, error) {
|
||||||
script := buildRestoreScript(cfg)
|
script := buildRestoreScript(cfg)
|
||||||
return launchScript(script, cfg)
|
return launchScript(script, cfg)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -282,69 +282,11 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pick a platform host and create a sandbox.
|
agent, sandboxIDStr, sandboxMetadata, err := s.provisionBuildSandbox(buildCtx, buildID, buildIDStr, build, log)
|
||||||
host, err := s.Scheduler.SelectHost(buildCtx, id.PlatformTeamID, false, build.MemoryMb, 5120)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("no host available: %v", err))
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
log = log.With("sandbox_id", sandboxIDStr)
|
||||||
agent, err := s.Pool.GetForHost(host)
|
|
||||||
if err != nil {
|
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("agent client error: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
sandboxID := id.NewSandboxID()
|
|
||||||
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
|
||||||
log = log.With("sandbox_id", sandboxIDStr, "host_id", id.FormatHostID(host.ID))
|
|
||||||
|
|
||||||
// Resolve the base template to UUIDs. "minimal" is the zero sentinel.
|
|
||||||
baseTeamID := id.PlatformTeamID
|
|
||||||
baseTemplateID := id.MinimalTemplateID
|
|
||||||
if build.BaseTemplate != "minimal" {
|
|
||||||
baseTmpl, err := s.DB.GetPlatformTemplateByName(buildCtx, build.BaseTemplate)
|
|
||||||
if err != nil {
|
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
baseTeamID = baseTmpl.TeamID
|
|
||||||
baseTemplateID = baseTmpl.ID
|
|
||||||
}
|
|
||||||
|
|
||||||
resp, err := agent.CreateSandbox(buildCtx, connect.NewRequest(&pb.CreateSandboxRequest{
|
|
||||||
SandboxId: sandboxIDStr,
|
|
||||||
Template: build.BaseTemplate,
|
|
||||||
TeamId: id.UUIDString(baseTeamID),
|
|
||||||
TemplateId: id.UUIDString(baseTemplateID),
|
|
||||||
Vcpus: build.Vcpus,
|
|
||||||
MemoryMb: build.MemoryMb,
|
|
||||||
TimeoutSec: 0, // no auto-pause for builds
|
|
||||||
DiskSizeMb: 5120, // 5 GB for template builds
|
|
||||||
}))
|
|
||||||
if err != nil {
|
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Capture sandbox metadata (envd/kernel/vmm/agent versions).
|
|
||||||
sandboxMetadata := resp.Msg.Metadata
|
|
||||||
|
|
||||||
// Record sandbox/host association.
|
|
||||||
_ = s.DB.UpdateBuildSandbox(buildCtx, db.UpdateBuildSandboxParams{
|
|
||||||
ID: buildID,
|
|
||||||
SandboxID: sandboxID,
|
|
||||||
HostID: host.ID,
|
|
||||||
})
|
|
||||||
|
|
||||||
// Upload and extract build archive if provided.
|
|
||||||
archive := s.takeArchive(buildIDStr)
|
|
||||||
if len(archive) > 0 {
|
|
||||||
if err := s.uploadAndExtractArchive(buildCtx, agent, sandboxIDStr, archive, buildIDStr); err != nil {
|
|
||||||
s.destroySandbox(buildCtx, agent, sandboxIDStr)
|
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("archive upload failed: %v", err))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse recipe steps. preBuildCmds and postBuildCmds are hardcoded and always
|
// Parse recipe steps. preBuildCmds and postBuildCmds are hardcoded and always
|
||||||
// valid; panic on error is appropriate here since it would be a programmer mistake.
|
// valid; panic on error is appropriate here since it would be a programmer mistake.
|
||||||
@ -435,81 +377,162 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Healthcheck or direct snapshot.
|
// Finalize: healthcheck/snapshot/flatten → persist template → mark success.
|
||||||
|
s.finalizeBuild(buildCtx, buildID, build, agent, sandboxIDStr, templateDefaultUser, templateDefaultEnv, sandboxMetadata, log)
|
||||||
|
}
|
||||||
|
|
||||||
|
// provisionBuildSandbox picks a host, creates a sandbox, and uploads the build
|
||||||
|
// archive. On failure it calls failBuild and returns an error.
|
||||||
|
func (s *BuildService) provisionBuildSandbox(
|
||||||
|
ctx context.Context,
|
||||||
|
buildID pgtype.UUID,
|
||||||
|
buildIDStr string,
|
||||||
|
build db.TemplateBuild,
|
||||||
|
log *slog.Logger,
|
||||||
|
) (buildAgentClient, string, map[string]string, error) {
|
||||||
|
host, err := s.Scheduler.SelectHost(ctx, id.PlatformTeamID, false, build.MemoryMb, 5120)
|
||||||
|
if err != nil {
|
||||||
|
s.failBuild(ctx, buildID, fmt.Sprintf("no host available: %v", err))
|
||||||
|
return nil, "", nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
agent, err := s.Pool.GetForHost(host)
|
||||||
|
if err != nil {
|
||||||
|
s.failBuild(ctx, buildID, fmt.Sprintf("agent client error: %v", err))
|
||||||
|
return nil, "", nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
sandboxID := id.NewSandboxID()
|
||||||
|
sandboxIDStr := id.FormatSandboxID(sandboxID)
|
||||||
|
log.Info("provisioning build sandbox", "sandbox_id", sandboxIDStr, "host_id", id.FormatHostID(host.ID))
|
||||||
|
|
||||||
|
baseTeamID := id.PlatformTeamID
|
||||||
|
baseTemplateID := id.MinimalTemplateID
|
||||||
|
if build.BaseTemplate != "minimal" {
|
||||||
|
baseTmpl, err := s.DB.GetPlatformTemplateByName(ctx, build.BaseTemplate)
|
||||||
|
if err != nil {
|
||||||
|
s.failBuild(ctx, buildID, fmt.Sprintf("base template %q not found: %v", build.BaseTemplate, err))
|
||||||
|
return nil, "", nil, err
|
||||||
|
}
|
||||||
|
baseTeamID = baseTmpl.TeamID
|
||||||
|
baseTemplateID = baseTmpl.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := agent.CreateSandbox(ctx, connect.NewRequest(&pb.CreateSandboxRequest{
|
||||||
|
SandboxId: sandboxIDStr,
|
||||||
|
Template: build.BaseTemplate,
|
||||||
|
TeamId: id.UUIDString(baseTeamID),
|
||||||
|
TemplateId: id.UUIDString(baseTemplateID),
|
||||||
|
Vcpus: build.Vcpus,
|
||||||
|
MemoryMb: build.MemoryMb,
|
||||||
|
TimeoutSec: 0,
|
||||||
|
DiskSizeMb: 5120,
|
||||||
|
}))
|
||||||
|
if err != nil {
|
||||||
|
s.failBuild(ctx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
||||||
|
return nil, "", nil, err
|
||||||
|
}
|
||||||
|
sandboxMetadata := resp.Msg.Metadata
|
||||||
|
|
||||||
|
_ = s.DB.UpdateBuildSandbox(ctx, db.UpdateBuildSandboxParams{
|
||||||
|
ID: buildID,
|
||||||
|
SandboxID: sandboxID,
|
||||||
|
HostID: host.ID,
|
||||||
|
})
|
||||||
|
|
||||||
|
archive := s.takeArchive(buildIDStr)
|
||||||
|
if len(archive) > 0 {
|
||||||
|
if err := s.uploadAndExtractArchive(ctx, agent, sandboxIDStr, archive, buildIDStr); err != nil {
|
||||||
|
s.destroySandbox(ctx, agent, sandboxIDStr)
|
||||||
|
s.failBuild(ctx, buildID, fmt.Sprintf("archive upload failed: %v", err))
|
||||||
|
return nil, "", nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return agent, sandboxIDStr, sandboxMetadata, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// finalizeBuild handles the healthcheck/snapshot/flatten step and persists the
|
||||||
|
// template record. Called after all recipe phases complete successfully.
|
||||||
|
func (s *BuildService) finalizeBuild(
|
||||||
|
ctx context.Context,
|
||||||
|
buildID pgtype.UUID,
|
||||||
|
build db.TemplateBuild,
|
||||||
|
agent buildAgentClient,
|
||||||
|
sandboxIDStr string,
|
||||||
|
defaultUser string,
|
||||||
|
defaultEnv map[string]string,
|
||||||
|
sandboxMetadata map[string]string,
|
||||||
|
log *slog.Logger,
|
||||||
|
) {
|
||||||
var sizeBytes int64
|
var sizeBytes int64
|
||||||
if build.Healthcheck != "" {
|
if build.Healthcheck != "" {
|
||||||
hc, err := recipe.ParseHealthcheck(build.Healthcheck)
|
hc, err := recipe.ParseHealthcheck(build.Healthcheck)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.destroySandbox(buildCtx, agent, sandboxIDStr)
|
s.destroySandbox(ctx, agent, sandboxIDStr)
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
|
s.failBuild(ctx, buildID, fmt.Sprintf("invalid healthcheck: %v", err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
|
log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries)
|
||||||
if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc, templateDefaultUser); err != nil {
|
if err := s.waitForHealthcheck(ctx, agent, sandboxIDStr, hc, defaultUser); err != nil {
|
||||||
s.destroySandbox(buildCtx, agent, sandboxIDStr)
|
s.destroySandbox(ctx, agent, sandboxIDStr)
|
||||||
if buildCtx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
|
s.failBuild(ctx, buildID, fmt.Sprintf("healthcheck failed: %v", err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Healthcheck passed → full snapshot (with memory/CPU state).
|
|
||||||
log.Info("healthcheck passed, creating snapshot")
|
log.Info("healthcheck passed, creating snapshot")
|
||||||
snapResp, err := agent.CreateSnapshot(buildCtx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
snapResp, err := agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
||||||
SandboxId: sandboxIDStr,
|
SandboxId: sandboxIDStr,
|
||||||
Name: build.Name,
|
Name: build.Name,
|
||||||
TeamId: id.UUIDString(build.TeamID),
|
TeamId: id.UUIDString(build.TeamID),
|
||||||
TemplateId: id.UUIDString(build.TemplateID),
|
TemplateId: id.UUIDString(build.TemplateID),
|
||||||
}))
|
}))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.destroySandbox(buildCtx, agent, sandboxIDStr)
|
s.destroySandbox(ctx, agent, sandboxIDStr)
|
||||||
if buildCtx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
|
s.failBuild(ctx, buildID, fmt.Sprintf("create snapshot failed: %v", err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
sizeBytes = snapResp.Msg.SizeBytes
|
sizeBytes = snapResp.Msg.SizeBytes
|
||||||
} else {
|
} else {
|
||||||
// No healthcheck → image-only template (rootfs only).
|
|
||||||
log.Info("no healthcheck, flattening rootfs")
|
log.Info("no healthcheck, flattening rootfs")
|
||||||
flatResp, err := agent.FlattenRootfs(buildCtx, connect.NewRequest(&pb.FlattenRootfsRequest{
|
flatResp, err := agent.FlattenRootfs(ctx, connect.NewRequest(&pb.FlattenRootfsRequest{
|
||||||
SandboxId: sandboxIDStr,
|
SandboxId: sandboxIDStr,
|
||||||
Name: build.Name,
|
Name: build.Name,
|
||||||
TeamId: id.UUIDString(build.TeamID),
|
TeamId: id.UUIDString(build.TeamID),
|
||||||
TemplateId: id.UUIDString(build.TemplateID),
|
TemplateId: id.UUIDString(build.TemplateID),
|
||||||
}))
|
}))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.destroySandbox(buildCtx, agent, sandboxIDStr)
|
s.destroySandbox(ctx, agent, sandboxIDStr)
|
||||||
if buildCtx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
|
s.failBuild(ctx, buildID, fmt.Sprintf("flatten rootfs failed: %v", err))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
sizeBytes = flatResp.Msg.SizeBytes
|
sizeBytes = flatResp.Msg.SizeBytes
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert into templates table as a global (platform) template.
|
|
||||||
templateType := "base"
|
templateType := "base"
|
||||||
if build.Healthcheck != "" {
|
if build.Healthcheck != "" {
|
||||||
templateType = "snapshot"
|
templateType = "snapshot"
|
||||||
}
|
}
|
||||||
|
|
||||||
// Serialize env vars for DB storage.
|
defaultEnvJSON, err := json.Marshal(defaultEnv)
|
||||||
defaultEnvJSON, err := json.Marshal(templateDefaultEnv)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
defaultEnvJSON = []byte("{}")
|
defaultEnvJSON = []byte("{}")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Serialize sandbox metadata for DB storage.
|
|
||||||
metadataJSON, err := json.Marshal(sandboxMetadata)
|
metadataJSON, err := json.Marshal(sandboxMetadata)
|
||||||
if err != nil || len(sandboxMetadata) == 0 {
|
if err != nil || len(sandboxMetadata) == 0 {
|
||||||
metadataJSON = []byte("{}")
|
metadataJSON = []byte("{}")
|
||||||
}
|
}
|
||||||
|
|
||||||
if _, err := s.DB.InsertTemplate(buildCtx, db.InsertTemplateParams{
|
if _, err := s.DB.InsertTemplate(ctx, db.InsertTemplateParams{
|
||||||
ID: build.TemplateID,
|
ID: build.TemplateID,
|
||||||
Name: build.Name,
|
Name: build.Name,
|
||||||
Type: templateType,
|
Type: templateType,
|
||||||
@ -517,28 +540,21 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
|
|||||||
MemoryMb: build.MemoryMb,
|
MemoryMb: build.MemoryMb,
|
||||||
SizeBytes: sizeBytes,
|
SizeBytes: sizeBytes,
|
||||||
TeamID: id.PlatformTeamID,
|
TeamID: id.PlatformTeamID,
|
||||||
DefaultUser: templateDefaultUser,
|
DefaultUser: defaultUser,
|
||||||
DefaultEnv: defaultEnvJSON,
|
DefaultEnv: defaultEnvJSON,
|
||||||
Metadata: metadataJSON,
|
Metadata: metadataJSON,
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Error("failed to insert template record", "error", err)
|
log.Error("failed to insert template record", "error", err)
|
||||||
// Build succeeded on disk, just DB record failed — don't mark as failed.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Record defaults and metadata on the build record for inspection.
|
_ = s.DB.UpdateBuildDefaults(ctx, db.UpdateBuildDefaultsParams{
|
||||||
_ = s.DB.UpdateBuildDefaults(buildCtx, db.UpdateBuildDefaultsParams{
|
|
||||||
ID: buildID,
|
ID: buildID,
|
||||||
DefaultUser: templateDefaultUser,
|
DefaultUser: defaultUser,
|
||||||
DefaultEnv: defaultEnvJSON,
|
DefaultEnv: defaultEnvJSON,
|
||||||
Metadata: metadataJSON,
|
Metadata: metadataJSON,
|
||||||
})
|
})
|
||||||
|
|
||||||
// For CreateSnapshot, the sandbox is already destroyed by the snapshot process.
|
if _, err := s.DB.UpdateBuildStatus(ctx, db.UpdateBuildStatusParams{
|
||||||
// For FlattenRootfs, the sandbox is already destroyed by the flatten process.
|
|
||||||
// No additional destroy needed.
|
|
||||||
|
|
||||||
// Mark build as success.
|
|
||||||
if _, err := s.DB.UpdateBuildStatus(buildCtx, db.UpdateBuildStatusParams{
|
|
||||||
ID: buildID, Status: "success",
|
ID: buildID, Status: "success",
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
log.Error("failed to mark build as success", "error", err)
|
log.Error("failed to mark build as success", "error", err)
|
||||||
|
|||||||
Reference in New Issue
Block a user