diff --git a/internal/devicemapper/devicemapper.go b/internal/devicemapper/devicemapper.go index 78801d3..c44f030 100644 --- a/internal/devicemapper/devicemapper.go +++ b/internal/devicemapper/devicemapper.go @@ -156,6 +156,15 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) // The CoW file must have been created with the persistent (P) flag and still // contain valid dm-snapshot metadata. func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { + // Defensively remove a stale device with the same name. This can happen + // if a previous pause failed to clean up the dm device (e.g. "device busy"). + if dmDeviceExists(name) { + slog.Warn("removing stale dm device before restore", "name", name) + if err := dmsetupRemove(name); err != nil { + return nil, fmt.Errorf("remove stale device %s: %w", name, err) + } + } + cowLoopDev, err := losetupCreateRW(cowPath) if err != nil { return nil, fmt.Errorf("losetup cow: %w", err) @@ -293,6 +302,11 @@ func dmsetupCreate(name, originDev, cowDev string, sectors int64) error { return nil } +// dmDeviceExists checks whether a device-mapper device with the given name exists. +func dmDeviceExists(name string) bool { + return exec.Command("dmsetup", "info", name).Run() == nil +} + // dmsetupRemove removes a device-mapper device. func dmsetupRemove(name string) error { cmd := exec.Command("dmsetup", "remove", name) diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index c030653..c166ef2 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -364,7 +364,22 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. 
if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { - warnErr("dm-snapshot remove error during pause", sandboxID, err) + // Hard error: if the dm device isn't removed, the CoW file is still + // in use and we can't safely move it. Release the network, slot, base image, + // UFFD socket and snapshot dir, and drop the sandbox from m.boxes before returning. + warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) + m.slots.Release(sb.SlotIndex) + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + if sb.uffdSocketPath != "" { + os.Remove(sb.uffdSocketPath) + } + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + m.mu.Lock() + delete(m.boxes, sandboxID) + m.mu.Unlock() + return fmt.Errorf("remove dm-snapshot: %w", err) } // Move (not copy) the CoW file into the snapshot directory.