Fix device-mapper "Device or resource busy" error on sandbox resume

Pause was logging RemoveSnapshot failures as warnings and continuing,
which left stale dm devices behind. Resume then failed trying to create
a device with the same name.

- Make RemoveSnapshot failure a hard error in Pause (clean up remaining
  resources and return error instead of silently proceeding)
- Add defensive stale device cleanup in RestoreSnapshot before creating
  the new dm device
This commit is contained in:
2026-03-14 03:57:14 +06:00
parent c92cc29b88
commit 1846168736
2 changed files with 30 additions and 1 deletions

View File

@ -156,6 +156,15 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64)
// The CoW file must have been created with the persistent (P) flag and still // The CoW file must have been created with the persistent (P) flag and still
// contain valid dm-snapshot metadata. // contain valid dm-snapshot metadata.
func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
// Defensively remove a stale device with the same name. This can happen
// if a previous pause failed to clean up the dm device (e.g. "device busy").
if dmDeviceExists(name) {
slog.Warn("removing stale dm device before restore", "name", name)
if err := dmsetupRemove(name); err != nil {
return nil, fmt.Errorf("remove stale device %s: %w", name, err)
}
}
cowLoopDev, err := losetupCreateRW(cowPath) cowLoopDev, err := losetupCreateRW(cowPath)
if err != nil { if err != nil {
return nil, fmt.Errorf("losetup cow: %w", err) return nil, fmt.Errorf("losetup cow: %w", err)
@ -293,6 +302,11 @@ func dmsetupCreate(name, originDev, cowDev string, sectors int64) error {
return nil return nil
} }
// dmDeviceExists reports whether device-mapper currently knows a device
// called name. It runs `dmsetup info <name>` and treats a successful run as
// "exists"; any error from running the command is reported as false.
func dmDeviceExists(name string) bool {
	infoErr := exec.Command("dmsetup", "info", name).Run()
	return infoErr == nil
}
// dmsetupRemove removes a device-mapper device. // dmsetupRemove removes a device-mapper device.
func dmsetupRemove(name string) error { func dmsetupRemove(name string) error {
cmd := exec.Command("dmsetup", "remove", name) cmd := exec.Command("dmsetup", "remove", name)

View File

@ -364,7 +364,22 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
// Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW.
if sb.dmDevice != nil { if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil {
warnErr("dm-snapshot remove error during pause", sandboxID, err) // Hard error: if the dm device isn't removed, the CoW file is still
// in use and we can't safely move it. The snapshot files from steps 2-3
// are removed and the remaining sandbox resources (network, slot, loop
// device, UFFD socket) are released below before the error is returned.
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
return fmt.Errorf("remove dm-snapshot: %w", err)
} }
// Move (not copy) the CoW file into the snapshot directory. // Move (not copy) the CoW file into the snapshot directory.