Fix sandbox lifecycle cleanup and dmsetup remove reliability

- Add retry with backoff to dmsetupRemove for transient "device busy"
  errors caused by kernel not releasing the device immediately after
  Firecracker exits. Only retries on "Device or resource busy"; other
  errors (not found, permission denied) return immediately.

- Thread context.Context through RemoveSnapshot/RestoreSnapshot so
  retries respect cancellation. Use context.Background() in all error
  cleanup paths to prevent cancelled contexts from skipping cleanup
  and leaking dm devices on the host.

- Resume vCPUs on pause failure: if snapshot creation or memfile
  processing fails after freezing the VM, unfreeze vCPUs so the
  sandbox stays usable instead of becoming a frozen zombie.

- Fix resource leaks in Pause when CoW rename or metadata write fails:
  properly clean up network, slot, loop device, and remove from boxes
  map instead of leaving a dead sandbox with leaked host resources.

- Fix Resume WaitUntilReady failure: roll back CoW file to the snapshot
  directory instead of deleting it, preserving the paused state so the
  user can retry.

- Skip m.loops.Release when RemoveSnapshot fails during pause since
  the stale dm device still references the origin loop device.

- Fix incorrect VCPUs placeholder in Resume VMConfig that used memory
  size instead of a sensible default.
This commit is contained in:
2026-03-14 06:42:34 +06:00
parent 1846168736
commit 88246fac2b
2 changed files with 106 additions and 38 deletions

View File

@ -5,6 +5,7 @@
package devicemapper package devicemapper
import ( import (
"context"
"fmt" "fmt"
"log/slog" "log/slog"
"os" "os"
@ -12,6 +13,7 @@ import (
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
"time"
) )
const ( const (
@ -155,12 +157,12 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64)
// RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file. // RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file.
// The CoW file must have been created with the persistent (P) flag and still // The CoW file must have been created with the persistent (P) flag and still
// contain valid dm-snapshot metadata. // contain valid dm-snapshot metadata.
func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { func RestoreSnapshot(ctx context.Context, name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
// Defensively remove a stale device with the same name. This can happen // Defensively remove a stale device with the same name. This can happen
// if a previous pause failed to clean up the dm device (e.g. "device busy"). // if a previous pause failed to clean up the dm device (e.g. "device busy").
if dmDeviceExists(name) { if dmDeviceExists(name) {
slog.Warn("removing stale dm device before restore", "name", name) slog.Warn("removing stale dm device before restore", "name", name)
if err := dmsetupRemove(name); err != nil { if err := dmsetupRemove(ctx, name); err != nil {
return nil, fmt.Errorf("remove stale device %s: %w", name, err) return nil, fmt.Errorf("remove stale device %s: %w", name, err)
} }
} }
@ -197,8 +199,8 @@ func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64)
// RemoveSnapshot tears down a dm-snapshot device and its CoW loop device. // RemoveSnapshot tears down a dm-snapshot device and its CoW loop device.
// The CoW file is NOT deleted — the caller decides whether to keep or remove it. // The CoW file is NOT deleted — the caller decides whether to keep or remove it.
func RemoveSnapshot(dev *SnapshotDevice) error { func RemoveSnapshot(ctx context.Context, dev *SnapshotDevice) error {
if err := dmsetupRemove(dev.Name); err != nil { if err := dmsetupRemove(ctx, dev.Name); err != nil {
return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err) return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err)
} }
@ -259,7 +261,7 @@ func CleanupStaleDevices() {
} }
slog.Warn("removing stale dm-snapshot device", "name", name) slog.Warn("removing stale dm-snapshot device", "name", name)
if err := dmsetupRemove(name); err != nil { if err := dmsetupRemove(context.Background(), name); err != nil {
slog.Warn("failed to remove stale device", "name", name, "error", err) slog.Warn("failed to remove stale device", "name", name, "error", err)
} }
} }
@ -307,13 +309,40 @@ func dmDeviceExists(name string) bool {
return exec.Command("dmsetup", "info", name).Run() == nil return exec.Command("dmsetup", "info", name).Run() == nil
} }
// dmsetupRemove removes a device-mapper device. // dmsetupRemove removes a device-mapper device, retrying on transient
func dmsetupRemove(name string) error { // "device busy" errors that occur when the kernel hasn't fully released
cmd := exec.Command("dmsetup", "remove", name) // the device after a Firecracker process exits.
if out, err := cmd.CombinedOutput(); err != nil { func dmsetupRemove(ctx context.Context, name string) error {
return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err) var lastErr error
for attempt := range 5 {
if attempt > 0 {
select {
case <-ctx.Done():
return fmt.Errorf("context cancelled while retrying dmsetup remove %s: %w", name, lastErr)
case <-time.After(200 * time.Millisecond):
}
}
cmd := exec.CommandContext(ctx, "dmsetup", "remove", name)
out, err := cmd.CombinedOutput()
if err == nil {
return nil
}
// If the context was cancelled, the process was killed and its
// output is unreliable. Return the context error directly so
// callers can distinguish cancellation from a real dm failure.
if ctx.Err() != nil {
return fmt.Errorf("dmsetup remove %s: %w", name, ctx.Err())
}
outStr := strings.TrimSpace(string(out))
lastErr = fmt.Errorf("%s: %w", outStr, err)
// Only retry on transient "busy" errors. Other failures
// (device not found, permission denied) are permanent.
if !strings.Contains(outStr, "Device or resource busy") {
return lastErr
}
slog.Debug("dmsetup remove retry", "name", name, "attempt", attempt+1, "error", lastErr)
} }
return nil return lastErr
} }
// createSparseFile creates a sparse file of the given size. // createSparseFile creates a sparse file of the given size.

View File

@ -139,7 +139,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
// Allocate network slot. // Allocate network slot.
slotIdx, err := m.slots.Allocate() slotIdx, err := m.slots.Allocate()
if err != nil { if err != nil {
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("allocate network slot: %w", err) return nil, fmt.Errorf("allocate network slot: %w", err)
@ -149,7 +149,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
// Set up network. // Set up network.
if err := network.CreateNetwork(slot); err != nil { if err := network.CreateNetwork(slot); err != nil {
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create network: %w", err) return nil, fmt.Errorf("create network: %w", err)
@ -173,7 +173,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
if _, err := m.vm.Create(ctx, vmCfg); err != nil { if _, err := m.vm.Create(ctx, vmCfg); err != nil {
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create VM: %w", err) return nil, fmt.Errorf("create VM: %w", err)
@ -188,7 +188,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("wait for envd: %w", err) return nil, fmt.Errorf("wait for envd: %w", err)
@ -262,7 +262,7 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
// Tear down dm-snapshot and release the base image loop device. // Tear down dm-snapshot and release the base image loop device.
if sb.dmDevice != nil { if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
} }
os.Remove(sb.dmDevice.CowPath) os.Remove(sb.dmDevice.CowPath)
@ -304,8 +304,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
snapshotType = "Diff" snapshotType = "Diff"
} }
// resumeOnError unpauses the VM so the sandbox stays usable when a
// post-freeze step fails. If the resume itself fails, the sandbox is
// left frozen — the caller should destroy it.
resumeOnError := func() {
if err := m.vm.Resume(ctx, sandboxID); err != nil {
slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
}
}
// Step 2: Take VM state snapshot (snapfile + memfile). // Step 2: Take VM state snapshot (snapfile + memfile).
if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil { if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
resumeOnError()
return fmt.Errorf("create snapshot dir: %w", err) return fmt.Errorf("create snapshot dir: %w", err)
} }
@ -316,6 +326,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
snapshotStart := time.Now() snapshotStart := time.Now()
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil { if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
resumeOnError()
return fmt.Errorf("create VM snapshot: %w", err) return fmt.Errorf("create VM snapshot: %w", err)
} }
slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart)) slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart))
@ -330,6 +341,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID) diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID)
if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil { if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
resumeOnError()
return fmt.Errorf("process memfile with parent: %w", err) return fmt.Errorf("process memfile with parent: %w", err)
} }
@ -339,6 +351,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
if prevPath != dstPath { if prevPath != dstPath {
if err := copyFile(prevPath, dstPath); err != nil { if err := copyFile(prevPath, dstPath); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
resumeOnError()
return fmt.Errorf("copy parent diff file: %w", err) return fmt.Errorf("copy parent diff file: %w", err)
} }
} }
@ -348,6 +361,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID) diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
resumeOnError()
return fmt.Errorf("process memfile: %w", err) return fmt.Errorf("process memfile: %w", err)
} }
} }
@ -363,15 +377,16 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
// Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW.
if sb.dmDevice != nil { if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
// Hard error: if the dm device isn't removed, the CoW file is still // Hard error: if the dm device isn't removed, the CoW file is still
// in use and we can't safely move it. The snapshot files from step 2-3 // in use and we can't safely move it. The VM is already destroyed so
// are cleaned up, but the sandbox resources remain so the user can retry. // the sandbox is unrecoverable — clean up remaining resources.
// Note: we intentionally skip m.loops.Release here because the stale
// dm device still references the origin loop device. Detaching it now
// would corrupt the dm device. CleanupStaleDevices handles this on
// next agent startup.
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex) m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" { if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath) os.Remove(sb.uffdSocketPath)
} }
@ -386,6 +401,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil { if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
// VM and dm-snapshot are already gone — clean up remaining resources.
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
return fmt.Errorf("move cow file: %w", err) return fmt.Errorf("move cow file: %w", err)
} }
@ -394,6 +421,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
BaseTemplate: sb.baseImagePath, BaseTemplate: sb.baseImagePath,
}); err != nil { }); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
// VM and dm-snapshot are already gone — clean up remaining resources.
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot))
m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
return fmt.Errorf("write rootfs meta: %w", err) return fmt.Errorf("write rootfs meta: %w", err)
} }
} }
@ -489,7 +528,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
// Restore dm-snapshot from existing persistent CoW file. // Restore dm-snapshot from existing persistent CoW file.
dmName := "wrenn-" + sandboxID dmName := "wrenn-" + sandboxID
dmDev, err := devicemapper.RestoreSnapshot(dmName, originLoop, cowPath, originSize) dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize)
if err != nil { if err != nil {
source.Close() source.Close()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
@ -501,7 +540,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
slotIdx, err := m.slots.Allocate() slotIdx, err := m.slots.Allocate()
if err != nil { if err != nil {
source.Close() source.Close()
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow() rollbackCow()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
return nil, fmt.Errorf("allocate network slot: %w", err) return nil, fmt.Errorf("allocate network slot: %w", err)
@ -511,7 +550,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
if err := network.CreateNetwork(slot); err != nil { if err := network.CreateNetwork(slot); err != nil {
source.Close() source.Close()
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow() rollbackCow()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
return nil, fmt.Errorf("create network: %w", err) return nil, fmt.Errorf("create network: %w", err)
@ -525,7 +564,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
source.Close() source.Close()
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow() rollbackCow()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
return nil, fmt.Errorf("start uffd server: %w", err) return nil, fmt.Errorf("start uffd server: %w", err)
@ -536,8 +575,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
SandboxID: sandboxID, SandboxID: sandboxID,
KernelPath: m.cfg.KernelPath, KernelPath: m.cfg.KernelPath,
RootfsPath: dmDev.DevicePath, RootfsPath: dmDev.DevicePath,
VCPUs: int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot. VCPUs: 1, // Placeholder; overridden by snapshot.
MemoryMB: int(header.Metadata.Size / (1024 * 1024)), MemoryMB: int(header.Metadata.Size / (1024 * 1024)), // Placeholder; overridden by snapshot.
NetworkNamespace: slot.NamespaceID, NetworkNamespace: slot.NamespaceID,
TapDevice: slot.TapName, TapDevice: slot.TapName,
TapMAC: slot.TapMAC, TapMAC: slot.TapMAC,
@ -552,7 +591,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
source.Close() source.Close()
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow() rollbackCow()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
return nil, fmt.Errorf("restore VM from snapshot: %w", err) return nil, fmt.Errorf("restore VM from snapshot: %w", err)
@ -569,8 +608,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) rollbackCow()
m.loops.Release(baseImagePath) m.loops.Release(baseImagePath)
return nil, fmt.Errorf("wait for envd: %w", err) return nil, fmt.Errorf("wait for envd: %w", err)
} }
@ -709,7 +748,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
// Temporarily restore the dm-snapshot to read the merged view. // Temporarily restore the dm-snapshot to read the merged view.
cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
tmpDmName := "wrenn-flatten-" + sandboxID tmpDmName := "wrenn-flatten-" + sandboxID
tmpDev, err := devicemapper.RestoreSnapshot(tmpDmName, originLoop, cowPath, originSize) tmpDev, err := devicemapper.RestoreSnapshot(ctx, tmpDmName, originLoop, cowPath, originSize)
if err != nil { if err != nil {
m.loops.Release(meta.BaseTemplate) m.loops.Release(meta.BaseTemplate)
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
@ -721,7 +760,7 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath) flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath)
// Always clean up the temporary dm device. // Always clean up the temporary dm device.
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(tmpDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), tmpDev))
m.loops.Release(meta.BaseTemplate) m.loops.Release(meta.BaseTemplate)
if flattenErr != nil { if flattenErr != nil {
@ -810,7 +849,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
slotIdx, err := m.slots.Allocate() slotIdx, err := m.slots.Allocate()
if err != nil { if err != nil {
source.Close() source.Close()
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("allocate network slot: %w", err) return nil, fmt.Errorf("allocate network slot: %w", err)
@ -820,7 +859,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
if err := network.CreateNetwork(slot); err != nil { if err := network.CreateNetwork(slot); err != nil {
source.Close() source.Close()
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create network: %w", err) return nil, fmt.Errorf("create network: %w", err)
@ -834,7 +873,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
source.Close() source.Close()
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("start uffd server: %w", err) return nil, fmt.Errorf("start uffd server: %w", err)
@ -861,7 +900,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
source.Close() source.Close()
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("restore VM from snapshot: %w", err) return nil, fmt.Errorf("restore VM from snapshot: %w", err)
@ -878,7 +917,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx) m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath) os.Remove(cowPath)
m.loops.Release(baseRootfs) m.loops.Release(baseRootfs)
return nil, fmt.Errorf("wait for envd: %w", err) return nil, fmt.Errorf("wait for envd: %w", err)