diff --git a/internal/devicemapper/devicemapper.go b/internal/devicemapper/devicemapper.go index c44f030..ea14fcd 100644 --- a/internal/devicemapper/devicemapper.go +++ b/internal/devicemapper/devicemapper.go @@ -5,6 +5,7 @@ package devicemapper import ( + "context" "fmt" "log/slog" "os" @@ -12,6 +13,7 @@ import ( "strconv" "strings" "sync" + "time" ) const ( @@ -155,12 +157,12 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) // RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file. // The CoW file must have been created with the persistent (P) flag and still // contain valid dm-snapshot metadata. -func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { +func RestoreSnapshot(ctx context.Context, name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { // Defensively remove a stale device with the same name. This can happen // if a previous pause failed to clean up the dm device (e.g. "device busy"). if dmDeviceExists(name) { slog.Warn("removing stale dm device before restore", "name", name) - if err := dmsetupRemove(name); err != nil { + if err := dmsetupRemove(ctx, name); err != nil { return nil, fmt.Errorf("remove stale device %s: %w", name, err) } } @@ -197,8 +199,8 @@ func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) // RemoveSnapshot tears down a dm-snapshot device and its CoW loop device. // The CoW file is NOT deleted — the caller decides whether to keep or remove it. -func RemoveSnapshot(dev *SnapshotDevice) error { - if err := dmsetupRemove(dev.Name); err != nil { +func RemoveSnapshot(ctx context.Context, dev *SnapshotDevice) error { + if err := dmsetupRemove(ctx, dev.Name); err != nil { return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err) } @@ -259,7 +261,7 @@ func CleanupStaleDevices() { } slog.Warn("removing stale dm-snapshot device", "name", name) - if err := dmsetupRemove(name); err != nil { + if err := dmsetupRemove(context.Background(), name); err != nil { slog.Warn("failed to remove stale device", "name", name, "error", err) } } @@ -307,13 +309,40 @@ func dmDeviceExists(name string) bool { return exec.Command("dmsetup", "info", name).Run() == nil } -// dmsetupRemove removes a device-mapper device. -func dmsetupRemove(name string) error { - cmd := exec.Command("dmsetup", "remove", name) - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err) +// dmsetupRemove removes a device-mapper device, retrying on transient +// "device busy" errors that occur when the kernel hasn't fully released +// the device after a Firecracker process exits. +func dmsetupRemove(ctx context.Context, name string) error { + var lastErr error + for attempt := range 5 { + if attempt > 0 { + select { + case <-ctx.Done(): + return fmt.Errorf("context cancelled while retrying dmsetup remove %s: %w", name, lastErr) + case <-time.After(200 * time.Millisecond): + } + } + cmd := exec.CommandContext(ctx, "dmsetup", "remove", name) + out, err := cmd.CombinedOutput() + if err == nil { + return nil + } + // If the context was cancelled, the process was killed and its + // output is unreliable. Return the context error directly so + // callers can distinguish cancellation from a real dm failure. + if ctx.Err() != nil { + return fmt.Errorf("dmsetup remove %s: %w", name, ctx.Err()) + } + outStr := strings.TrimSpace(string(out)) + lastErr = fmt.Errorf("%s: %w", outStr, err) + // Only retry on transient "busy" errors. Other failures + // (device not found, permission denied) are permanent. + if !strings.Contains(outStr, "Device or resource busy") { + return lastErr + } + slog.Debug("dmsetup remove retry", "name", name, "attempt", attempt+1, "error", lastErr) } - return nil + return lastErr } // createSparseFile creates a sparse file of the given size. diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index c166ef2..62fe1bc 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -139,7 +139,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, // Allocate network slot. slotIdx, err := m.slots.Allocate() if err != nil { - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("allocate network slot: %w", err) @@ -149,7 +149,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, // Set up network. if err := network.CreateNetwork(slot); err != nil { m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("create network: %w", err) @@ -173,7 +173,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, if _, err := m.vm.Create(ctx, vmCfg); err != nil { warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("create VM: %w", err) @@ -188,7 +188,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("wait for envd: %w", err) @@ -262,7 +262,7 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { // Tear down dm-snapshot and release the base image loop device. if sb.dmDevice != nil { - if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { + if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil { slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) } os.Remove(sb.dmDevice.CowPath) @@ -304,8 +304,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { snapshotType = "Diff" } + // resumeOnError unpauses the VM so the sandbox stays usable when a + // post-freeze step fails. If the resume itself fails, the sandbox is + // left frozen — the caller should destroy it. + resumeOnError := func() { + if err := m.vm.Resume(ctx, sandboxID); err != nil { + slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err) + } + } + // Step 2: Take VM state snapshot (snapfile + memfile). if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil { + resumeOnError() return fmt.Errorf("create snapshot dir: %w", err) } @@ -316,6 +326,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { snapshotStart := time.Now() if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + resumeOnError() return fmt.Errorf("create VM snapshot: %w", err) } slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart)) @@ -330,6 +341,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID) if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + resumeOnError() return fmt.Errorf("process memfile with parent: %w", err) } @@ -339,6 +351,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { if prevPath != dstPath { if err := copyFile(prevPath, dstPath); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + resumeOnError() return fmt.Errorf("copy parent diff file: %w", err) } } @@ -348,6 +361,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID) if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + resumeOnError() return fmt.Errorf("process memfile: %w", err) } } @@ -363,15 +377,16 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. if sb.dmDevice != nil { - if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { + if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil { // Hard error: if the dm device isn't removed, the CoW file is still - // in use and we can't safely move it. The snapshot files from step 2-3 - // are cleaned up, but the sandbox resources remain so the user can retry. + // in use and we can't safely move it. The VM is already destroyed so + // the sandbox is unrecoverable — clean up remaining resources. + // Note: we intentionally skip m.loops.Release here because the stale + // dm device still references the origin loop device. Detaching it now + // would corrupt the dm device. CleanupStaleDevices handles this on + // next agent startup. warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) m.slots.Release(sb.SlotIndex) - if sb.baseImagePath != "" { - m.loops.Release(sb.baseImagePath) - } if sb.uffdSocketPath != "" { os.Remove(sb.uffdSocketPath) } @@ -386,6 +401,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + // VM and dm-snapshot are already gone — clean up remaining resources. + warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) + m.slots.Release(sb.SlotIndex) + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + if sb.uffdSocketPath != "" { + os.Remove(sb.uffdSocketPath) + } + m.mu.Lock() + delete(m.boxes, sandboxID) + m.mu.Unlock() return fmt.Errorf("move cow file: %w", err) } @@ -394,6 +421,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { BaseTemplate: sb.baseImagePath, }); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + // VM and dm-snapshot are already gone — clean up remaining resources. + warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) + m.slots.Release(sb.SlotIndex) + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + if sb.uffdSocketPath != "" { + os.Remove(sb.uffdSocketPath) + } + m.mu.Lock() + delete(m.boxes, sandboxID) + m.mu.Unlock() return fmt.Errorf("write rootfs meta: %w", err) } } @@ -489,7 +528,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox // Restore dm-snapshot from existing persistent CoW file. dmName := "wrenn-" + sandboxID - dmDev, err := devicemapper.RestoreSnapshot(dmName, originLoop, cowPath, originSize) + dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize) if err != nil { source.Close() m.loops.Release(baseImagePath) @@ -501,7 +540,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox slotIdx, err := m.slots.Allocate() if err != nil { source.Close() - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() m.loops.Release(baseImagePath) return nil, fmt.Errorf("allocate network slot: %w", err) @@ -511,7 +550,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox if err := network.CreateNetwork(slot); err != nil { source.Close() m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() m.loops.Release(baseImagePath) return nil, fmt.Errorf("create network: %w", err) @@ -525,7 +564,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox source.Close() warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() m.loops.Release(baseImagePath) return nil, fmt.Errorf("start uffd server: %w", err) @@ -536,8 +575,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox SandboxID: sandboxID, KernelPath: m.cfg.KernelPath, RootfsPath: dmDev.DevicePath, - VCPUs: int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot. - MemoryMB: int(header.Metadata.Size / (1024 * 1024)), + VCPUs: 1, // Placeholder; overridden by snapshot. + MemoryMB: int(header.Metadata.Size / (1024 * 1024)), // Placeholder; overridden by snapshot. NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, @@ -552,7 +591,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox source.Close() warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() m.loops.Release(baseImagePath) return nil, fmt.Errorf("restore VM from snapshot: %w", err) @@ -569,8 +608,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) - os.Remove(cowPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) + rollbackCow() m.loops.Release(baseImagePath) return nil, fmt.Errorf("wait for envd: %w", err) } @@ -709,7 +748,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i // Temporarily restore the dm-snapshot to read the merged view. cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) tmpDmName := "wrenn-flatten-" + sandboxID - tmpDev, err := devicemapper.RestoreSnapshot(tmpDmName, originLoop, cowPath, originSize) + tmpDev, err := devicemapper.RestoreSnapshot(ctx, tmpDmName, originLoop, cowPath, originSize) if err != nil { m.loops.Release(meta.BaseTemplate) warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) @@ -721,7 +760,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath) // Always clean up the temporary dm device. - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(tmpDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), tmpDev)) m.loops.Release(meta.BaseTemplate) if flattenErr != nil { @@ -810,7 +849,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam slotIdx, err := m.slots.Allocate() if err != nil { source.Close() - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("allocate network slot: %w", err) @@ -820,7 +859,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam if err := network.CreateNetwork(slot); err != nil { source.Close() m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("create network: %w", err) @@ -834,7 +873,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam source.Close() warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("start uffd server: %w", err) @@ -861,7 +900,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam source.Close() warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("restore VM from snapshot: %w", err) @@ -878,7 +917,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) return nil, fmt.Errorf("wait for envd: %w", err)