1
0
forked from wrenn/wrenn

refactor: polish control plane and host agent code

- Decompose executeBuild (318 lines) into provisionBuildSandbox and
  finalizeBuild helpers for readability
- Extract cleanupPauseFailure in sandbox manager to unify 3 inconsistent
  inline teardown paths (also fixes CoW file leak on rename failure)
- Remove unused ctx parameter from startProcess/startProcessForRestore
- Add missing MASQUERADE rollback entry in CreateNetwork for symmetry
- Consolidate duplicate writeJSON for UTF-8/base64 exec response
This commit is contained in:
2026-05-17 02:11:48 +06:00
parent 124e097e23
commit 74f85ce4e9
6 changed files with 146 additions and 148 deletions

View File

@ -130,30 +130,22 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
updateLastActive(h.db, sandboxID, sandboxIDStr)
// Use base64 encoding if output contains non-UTF-8 bytes.
stdout := resp.Msg.Stdout
stderr := resp.Msg.Stderr
encoding := "utf-8"
encoding := "utf-8"
stdoutStr, stderrStr := string(stdout), string(stderr)
if !utf8.Valid(stdout) || !utf8.Valid(stderr) {
encoding = "base64"
writeJSON(w, http.StatusOK, execResponse{
SandboxID: sandboxIDStr,
Cmd: req.Cmd,
Stdout: base64.StdEncoding.EncodeToString(stdout),
Stderr: base64.StdEncoding.EncodeToString(stderr),
ExitCode: resp.Msg.ExitCode,
DurationMs: duration.Milliseconds(),
Encoding: encoding,
})
return
stdoutStr = base64.StdEncoding.EncodeToString(stdout)
stderrStr = base64.StdEncoding.EncodeToString(stderr)
}
writeJSON(w, http.StatusOK, execResponse{
SandboxID: sandboxIDStr,
Cmd: req.Cmd,
Stdout: string(stdout),
Stderr: string(stderr),
Stdout: stdoutStr,
Stderr: stderrStr,
ExitCode: resp.Msg.ExitCode,
DurationMs: duration.Milliseconds(),
Encoding: encoding,

View File

@ -430,6 +430,9 @@ func CreateNetwork(slot *Slot) error {
rollback()
return fmt.Errorf("add masquerade rule: %w", err)
}
rollbacks = append(rollbacks, func() {
_ = iptablesHost("-t", "nat", "-D", "POSTROUTING", "-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()), "-o", defaultIface, "-j", "MASQUERADE")
})
slog.Info("network created",
"ns", slot.NamespaceID,

View File

@ -359,6 +359,25 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
}
}
// cleanupPauseFailure is best-effort teardown when a pause operation fails
// after the VM has already been destroyed. It releases all resources and removes
// the sandbox from the in-memory map.
func (m *Manager) cleanupPauseFailure(sb *sandboxState, sandboxID string, pauseDir string) {
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.dmDevice != nil {
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
os.Remove(sb.dmDevice.CowPath)
}
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
}
// Pause takes a snapshot of a running sandbox, then destroys all resources.
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
// After this call, the sandbox is no longer running but can be resumed.
@ -513,45 +532,21 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
}
if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.dmDevice != nil {
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
os.Remove(sb.dmDevice.CowPath)
}
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("rename snapshot dir: %w", err)
}
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("remove dm-snapshot: %w", err)
}
snapshotCow := snapshot.CowPath(pauseDir, "")
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
os.Remove(sb.dmDevice.CowPath)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("move cow file: %w", err)
}
@ -561,15 +556,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
VCPUs: sb.VCPUs,
MemoryMB: sb.MemoryMB,
}); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("write rootfs meta: %w", err)
}
}

View File

@ -47,7 +47,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
)
// Step 1: Launch the Cloud Hypervisor process.
proc, err := startProcess(ctx, &cfg)
proc, err := startProcess(&cfg)
if err != nil {
return nil, fmt.Errorf("start process: %w", err)
}
@ -220,7 +220,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshot
)
// Step 1: Launch bare CH process (no --restore).
proc, err := startProcessForRestore(ctx, &cfg)
proc, err := startProcessForRestore(&cfg)
if err != nil {
return nil, fmt.Errorf("start process: %w", err)
}

View File

@ -30,7 +30,7 @@ type process struct {
// 4. symlink kernel and rootfs into SandboxDir
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
// 6. exec cloud-hypervisor with the API socket path
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
func startProcess(cfg *VMConfig) (*process, error) {
script := buildStartScript(cfg)
return launchScript(script, cfg)
}
@ -38,7 +38,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
// The restore is performed via the API after the socket is ready, which allows
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
func startProcessForRestore(cfg *VMConfig) (*process, error) {
script := buildRestoreScript(cfg)
return launchScript(script, cfg)
}