forked from wrenn/wrenn
refactor: polish control plane and host agent code
- Decompose executeBuild (318 lines) into provisionBuildSandbox and finalizeBuild helpers for readability - Extract cleanupPauseFailure in sandbox manager to unify 3 inconsistent inline teardown paths (also fixes CoW file leak on rename failure) - Remove unused ctx parameter from startProcess/startProcessForRestore - Add missing MASQUERADE rollback entry in CreateNetwork for symmetry - Consolidate duplicate writeJSON for UTF-8/base64 exec response
This commit is contained in:
@ -130,30 +130,22 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
updateLastActive(h.db, sandboxID, sandboxIDStr)
|
||||
|
||||
// Use base64 encoding if output contains non-UTF-8 bytes.
|
||||
stdout := resp.Msg.Stdout
|
||||
stderr := resp.Msg.Stderr
|
||||
encoding := "utf-8"
|
||||
|
||||
encoding := "utf-8"
|
||||
stdoutStr, stderrStr := string(stdout), string(stderr)
|
||||
if !utf8.Valid(stdout) || !utf8.Valid(stderr) {
|
||||
encoding = "base64"
|
||||
writeJSON(w, http.StatusOK, execResponse{
|
||||
SandboxID: sandboxIDStr,
|
||||
Cmd: req.Cmd,
|
||||
Stdout: base64.StdEncoding.EncodeToString(stdout),
|
||||
Stderr: base64.StdEncoding.EncodeToString(stderr),
|
||||
ExitCode: resp.Msg.ExitCode,
|
||||
DurationMs: duration.Milliseconds(),
|
||||
Encoding: encoding,
|
||||
})
|
||||
return
|
||||
stdoutStr = base64.StdEncoding.EncodeToString(stdout)
|
||||
stderrStr = base64.StdEncoding.EncodeToString(stderr)
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, execResponse{
|
||||
SandboxID: sandboxIDStr,
|
||||
Cmd: req.Cmd,
|
||||
Stdout: string(stdout),
|
||||
Stderr: string(stderr),
|
||||
Stdout: stdoutStr,
|
||||
Stderr: stderrStr,
|
||||
ExitCode: resp.Msg.ExitCode,
|
||||
DurationMs: duration.Milliseconds(),
|
||||
Encoding: encoding,
|
||||
|
||||
@ -430,6 +430,9 @@ func CreateNetwork(slot *Slot) error {
|
||||
rollback()
|
||||
return fmt.Errorf("add masquerade rule: %w", err)
|
||||
}
|
||||
rollbacks = append(rollbacks, func() {
|
||||
_ = iptablesHost("-t", "nat", "-D", "POSTROUTING", "-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()), "-o", defaultIface, "-j", "MASQUERADE")
|
||||
})
|
||||
|
||||
slog.Info("network created",
|
||||
"ns", slot.NamespaceID,
|
||||
|
||||
@ -359,6 +359,25 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
|
||||
}
|
||||
}
|
||||
|
||||
// cleanupPauseFailure is best-effort teardown when a pause operation fails
|
||||
// after the VM has already been destroyed. It releases all resources and removes
|
||||
// the sandbox from the in-memory map.
|
||||
func (m *Manager) cleanupPauseFailure(sb *sandboxState, sandboxID string, pauseDir string) {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
if sb.dmDevice != nil {
|
||||
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
|
||||
os.Remove(sb.dmDevice.CowPath)
|
||||
}
|
||||
if sb.baseImagePath != "" {
|
||||
m.loops.Release(sb.baseImagePath)
|
||||
}
|
||||
m.mu.Lock()
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// Pause takes a snapshot of a running sandbox, then destroys all resources.
|
||||
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
|
||||
// After this call, the sandbox is no longer running but can be resumed.
|
||||
@ -513,45 +532,21 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err)
|
||||
}
|
||||
if err := os.Rename(tmpPauseDir, pauseDir); err != nil {
|
||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
if sb.dmDevice != nil {
|
||||
warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice))
|
||||
os.Remove(sb.dmDevice.CowPath)
|
||||
}
|
||||
if sb.baseImagePath != "" {
|
||||
m.loops.Release(sb.baseImagePath)
|
||||
}
|
||||
m.mu.Lock()
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||
return fmt.Errorf("rename snapshot dir: %w", err)
|
||||
}
|
||||
|
||||
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
|
||||
if sb.dmDevice != nil {
|
||||
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
|
||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||
m.mu.Lock()
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||
return fmt.Errorf("remove dm-snapshot: %w", err)
|
||||
}
|
||||
|
||||
snapshotCow := snapshot.CowPath(pauseDir, "")
|
||||
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
os.Remove(sb.dmDevice.CowPath)
|
||||
if sb.baseImagePath != "" {
|
||||
m.loops.Release(sb.baseImagePath)
|
||||
}
|
||||
m.mu.Lock()
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||
return fmt.Errorf("move cow file: %w", err)
|
||||
}
|
||||
|
||||
@ -561,15 +556,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
VCPUs: sb.VCPUs,
|
||||
MemoryMB: sb.MemoryMB,
|
||||
}); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir))
|
||||
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
|
||||
m.slots.Release(sb.SlotIndex)
|
||||
if sb.baseImagePath != "" {
|
||||
m.loops.Release(sb.baseImagePath)
|
||||
}
|
||||
m.mu.Lock()
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
|
||||
return fmt.Errorf("write rootfs meta: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -47,7 +47,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
)
|
||||
|
||||
// Step 1: Launch the Cloud Hypervisor process.
|
||||
proc, err := startProcess(ctx, &cfg)
|
||||
proc, err := startProcess(&cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
}
|
||||
@ -220,7 +220,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshot
|
||||
)
|
||||
|
||||
// Step 1: Launch bare CH process (no --restore).
|
||||
proc, err := startProcessForRestore(ctx, &cfg)
|
||||
proc, err := startProcessForRestore(&cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
}
|
||||
|
||||
@ -30,7 +30,7 @@ type process struct {
|
||||
// 4. symlink kernel and rootfs into SandboxDir
|
||||
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
|
||||
// 6. exec cloud-hypervisor with the API socket path
|
||||
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
func startProcess(cfg *VMConfig) (*process, error) {
|
||||
script := buildStartScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
@ -38,7 +38,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
|
||||
// The restore is performed via the API after the socket is ready, which allows
|
||||
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
func startProcessForRestore(cfg *VMConfig) (*process, error) {
|
||||
script := buildRestoreScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user