Add diff snapshots for re-pause to avoid UFFD fault-in storm

Use Firecracker's Diff snapshot type when re-pausing a previously
resumed sandbox, capturing only dirty pages instead of a full memory
dump. Chains up to 10 incremental generations before collapsing back
to a Full snapshot. Multi-generation diff files (memfile.{buildID})
are supported alongside the legacy single-file format in resume,
template creation, and snapshot existence checks.
This commit is contained in:
2026-03-13 09:37:54 +06:00
parent a0d635ae5e
commit 80a99eec87
5 changed files with 181 additions and 42 deletions

View File

@ -50,8 +50,24 @@ type sandboxState struct {
uffdSocketPath string // non-empty for sandboxes restored from snapshot uffdSocketPath string // non-empty for sandboxes restored from snapshot
dmDevice *devicemapper.SnapshotDevice dmDevice *devicemapper.SnapshotDevice
baseImagePath string // path to the base template rootfs (for loop registry release) baseImagePath string // path to the base template rootfs (for loop registry release)
// parent holds the snapshot header and diff file paths from which this
// sandbox was restored. Non-nil means re-pause should use "Diff" snapshot
// type instead of "Full", avoiding the UFFD fault-in storm.
parent *snapshotParent
} }
// snapshotParent stores the previous generation's snapshot state so that
// re-pause can produce an incremental diff instead of a full memory dump.
type snapshotParent struct {
header *snapshot.Header
diffPaths map[string]string // build ID → file path
}
// maxDiffGenerations caps how many incremental diff generations we chain
// before falling back to a Full snapshot to collapse the chain.
const maxDiffGenerations = 10
// New creates a new sandbox manager. // New creates a new sandbox manager.
func New(cfg Config) *Manager { func New(cfg Config) *Manager {
if cfg.EnvdTimeout == 0 { if cfg.EnvdTimeout == 0 {
@ -281,7 +297,14 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
} }
slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart)) slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
// Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately. // Determine snapshot type: Diff if resumed from snapshot (avoids UFFD
// fault-in storm), Full otherwise or if generation cap is reached.
snapshotType := "Full"
if sb.parent != nil && sb.parent.header.Metadata.Generation < maxDiffGenerations {
snapshotType = "Diff"
}
// Step 2: Take VM state snapshot (snapfile + memfile).
if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil { if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
return fmt.Errorf("create snapshot dir: %w", err) return fmt.Errorf("create snapshot dir: %w", err)
} }
@ -290,28 +313,47 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
rawMemPath := filepath.Join(snapDir, "memfile.raw") rawMemPath := filepath.Join(snapDir, "memfile.raw")
snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID) snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID)
// For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to
// serialize memory — this is the main bottleneck on re-pause.
snapshotStart := time.Now() snapshotStart := time.Now()
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil { if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("create VM snapshot: %w", err) return fmt.Errorf("create VM snapshot: %w", err)
} }
slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart)) slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart))
// Step 3: Process the raw memfile into a compact diff + header. // Step 3: Process the raw memfile into a compact diff + header.
buildID := uuid.New() buildID := uuid.New()
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID) headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID)
processStart := time.Now() processStart := time.Now()
if sb.parent != nil && snapshotType == "Diff" {
// Diff: process against parent header, producing only changed blocks.
diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID)
if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("process memfile with parent: %w", err)
}
// Copy previous generation diff files into the snapshot directory.
for prevBuildID, prevPath := range sb.parent.diffPaths {
dstPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, uuid.MustParse(prevBuildID))
if prevPath != dstPath {
if err := copyFile(prevPath, dstPath); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("copy parent diff file: %w", err)
}
}
}
} else {
// Full: first generation or generation cap reached — single diff file.
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("process memfile: %w", err) return fmt.Errorf("process memfile: %w", err)
} }
slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart)) }
slog.Debug("pause: memfile processed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(processStart))
// Remove the raw memfile — we only keep the compact diff. // Remove the raw memfile — we only keep the compact diff(s).
os.Remove(rawMemPath) os.Remove(rawMemPath)
// Step 4: Destroy the VM first so Firecracker releases the dm device. // Step 4: Destroy the VM first so Firecracker releases the dm device.
@ -357,7 +399,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
delete(m.boxes, sandboxID) delete(m.boxes, sandboxID)
m.mu.Unlock() m.mu.Unlock()
slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart)) slog.Info("sandbox paused", "id", sandboxID, "snapshot_type", snapshotType, "total_elapsed", time.Since(pauseStart))
return nil return nil
} }
@ -380,9 +422,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
return nil, fmt.Errorf("deserialize header: %w", err) return nil, fmt.Errorf("deserialize header: %w", err)
} }
// Build diff file map (build ID → file path). // Build diff file map — supports both single-generation and multi-generation.
diffPaths := map[string]string{ diffPaths, err := snapshot.ListDiffFiles(snapDir, sandboxID, header)
header.Metadata.BuildID.String(): snapshot.MemDiffPath(snapDir, sandboxID), if err != nil {
return nil, fmt.Errorf("list diff files: %w", err)
} }
source, err := uffd.NewDiffFileSource(header, diffPaths) source, err := uffd.NewDiffFileSource(header, diffPaths)
@ -537,20 +580,26 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
uffdSocketPath: uffdSocketPath, uffdSocketPath: uffdSocketPath,
dmDevice: dmDev, dmDevice: dmDev,
baseImagePath: baseImagePath, baseImagePath: baseImagePath,
// Preserve parent snapshot info so re-pause can use Diff snapshots.
parent: &snapshotParent{
header: header,
diffPaths: diffPaths,
},
} }
m.mu.Lock() m.mu.Lock()
m.boxes[sandboxID] = sb m.boxes[sandboxID] = sb
m.mu.Unlock() m.mu.Unlock()
// Clean up remaining snapshot files (snapfile, memfile, header, meta). // Don't delete snapshot dir — diff files are needed for re-pause.
// The CoW file was already moved out. // The CoW file was already moved out. The dir will be cleaned up
warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID)) // on destroy or overwritten on re-pause.
slog.Info("sandbox resumed from snapshot", slog.Info("sandbox resumed from snapshot",
"id", sandboxID, "id", sandboxID,
"host_ip", slot.HostIP.String(), "host_ip", slot.HostIP.String(),
"dm_device", dmDev.DevicePath, "dm_device", dmDev.DevicePath,
"generation", header.Metadata.Generation,
) )
return &sb.Sandbox, nil return &sb.Sandbox, nil
@ -585,11 +634,11 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
return 0, fmt.Errorf("create template dir: %w", err) return 0, fmt.Errorf("create template dir: %w", err)
} }
// Copy VM snapshot and memory files. // Copy VM snapshot file and memory header.
srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID) srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
dstDir := snapshot.DirPath(m.cfg.ImagesDir, name) dstDir := snapshot.DirPath(m.cfg.ImagesDir, name)
for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} { for _, fname := range []string{snapshot.SnapFileName, snapshot.MemHeaderName} {
src := filepath.Join(srcDir, fname) src := filepath.Join(srcDir, fname)
dst := filepath.Join(dstDir, fname) dst := filepath.Join(dstDir, fname)
if err := copyFile(src, dst); err != nil { if err := copyFile(src, dst); err != nil {
@ -598,6 +647,30 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
} }
} }
// Copy all memory diff files referenced by the header (supports multi-generation).
headerData, err := os.ReadFile(filepath.Join(srcDir, snapshot.MemHeaderName))
if err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("read header for template: %w", err)
}
srcHeader, err := snapshot.Deserialize(headerData)
if err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("deserialize header for template: %w", err)
}
srcDiffPaths, err := snapshot.ListDiffFiles(m.cfg.SnapshotsDir, sandboxID, srcHeader)
if err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("list diff files for template: %w", err)
}
for _, srcPath := range srcDiffPaths {
dstPath := filepath.Join(dstDir, filepath.Base(srcPath))
if err := copyFile(srcPath, dstPath); err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("copy diff file %s: %w", filepath.Base(srcPath), err)
}
}
// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image. // Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID) meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID)
if err != nil { if err != nil {
@ -683,9 +756,10 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
// Snapshot determines memory size. // Snapshot determines memory size.
memoryMB := int(header.Metadata.Size / (1024 * 1024)) memoryMB := int(header.Metadata.Size / (1024 * 1024))
// Build diff file map. // Build diff file map — supports multi-generation templates.
diffPaths := map[string]string{ diffPaths, err := snapshot.ListDiffFiles(imagesDir, snapshotName, header)
header.Metadata.BuildID.String(): snapshot.MemDiffPath(imagesDir, snapshotName), if err != nil {
return nil, fmt.Errorf("list diff files: %w", err)
} }
source, err := uffd.NewDiffFileSource(header, diffPaths) source, err := uffd.NewDiffFileSource(header, diffPaths)
@ -815,6 +889,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
uffdSocketPath: uffdSocketPath, uffdSocketPath: uffdSocketPath,
dmDevice: dmDev, dmDevice: dmDev,
baseImagePath: baseRootfs, baseImagePath: baseRootfs,
// Template-spawned sandboxes also get diff re-pause support.
parent: &snapshotParent{
header: header,
diffPaths: diffPaths,
},
} }
m.mu.Lock() m.mu.Lock()

View File

@ -7,6 +7,8 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"syscall" "syscall"
"github.com/google/uuid"
) )
const ( const (
@ -28,11 +30,17 @@ func SnapPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), SnapFileName) return filepath.Join(DirPath(baseDir, name), SnapFileName)
} }
// MemDiffPath returns the path to the compact memory diff file. // MemDiffPath returns the path to the compact memory diff file (legacy single-generation).
func MemDiffPath(baseDir, name string) string { func MemDiffPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemDiffName) return filepath.Join(DirPath(baseDir, name), MemDiffName)
} }
// MemDiffPathForBuild returns the path to a specific generation's diff file.
// Format: memfile.{buildID}
func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string {
return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String()))
}
// MemHeaderPath returns the path to the memory mapping header file. // MemHeaderPath returns the path to the memory mapping header file.
func MemHeaderPath(baseDir, name string) string { func MemHeaderPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemHeaderName) return filepath.Join(DirPath(baseDir, name), MemHeaderName)
@ -85,17 +93,38 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
// Exists reports whether a complete snapshot exists (all required files present). // Exists reports whether a complete snapshot exists (all required files present).
// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots. // Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}".
func Exists(baseDir, name string) bool { func Exists(baseDir, name string) bool {
dir := DirPath(baseDir, name) dir := DirPath(baseDir, name)
// Common files required by both formats. // snapfile and header are always required.
for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} { for _, f := range []string{SnapFileName, MemHeaderName} {
if _, err := os.Stat(filepath.Join(dir, f)); err != nil { if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
return false return false
} }
} }
// Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot). // Check that at least one memfile exists (legacy or generation-specific).
// We verify by reading the header and checking that referenced diff files exist.
// Fall back to checking for the legacy memfile name if header can't be read.
if _, err := os.Stat(filepath.Join(dir, MemDiffName)); err != nil {
// No legacy memfile — check if any memfile.{uuid} exists by
// looking for files matching the pattern.
matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*"))
hasGenDiff := false
for _, m := range matches {
base := filepath.Base(m)
if base != MemHeaderName {
hasGenDiff = true
break
}
}
if !hasGenDiff {
return false
}
}
// Accept either rootfs.ext4 (legacy/template) or rootfs.cow + rootfs.meta (dm-snapshot).
if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil { if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil {
return true return true
} }
@ -127,6 +156,37 @@ func HasCow(baseDir, name string) bool {
return cowErr == nil && metaErr == nil return cowErr == nil && metaErr == nil
} }
// ListDiffFiles returns a map of build ID → file path for all memory diff files
// referenced by the given header. Handles both the legacy "memfile" name
// (single-generation) and generation-specific "memfile.{uuid}" names.
func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) {
dir := DirPath(baseDir, name)
result := make(map[string]string)
for _, m := range header.Mapping {
if m.BuildID == uuid.Nil {
continue // zero-fill, no file needed
}
idStr := m.BuildID.String()
if _, exists := result[idStr]; exists {
continue
}
// Try generation-specific path first, fall back to legacy.
genPath := filepath.Join(dir, fmt.Sprintf("memfile.%s", idStr))
if _, err := os.Stat(genPath); err == nil {
result[idStr] = genPath
continue
}
legacyPath := filepath.Join(dir, MemDiffName)
if _, err := os.Stat(legacyPath); err == nil {
result[idStr] = legacyPath
continue
}
return nil, fmt.Errorf("diff file not found for build %s", idStr)
}
return result, nil
}
// EnsureDir creates the snapshot directory if it doesn't exist. // EnsureDir creates the snapshot directory if it doesn't exist.
func EnsureDir(baseDir, name string) error { func EnsureDir(baseDir, name string) error {
dir := DirPath(baseDir, name) dir := DirPath(baseDir, name)

View File

@ -123,7 +123,6 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
totalBlocks := TotalBlocks(memSize, DefaultBlockSize) totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
dirty := make([]bool, totalBlocks) dirty := make([]bool, totalBlocks)
empty := make([]bool, totalBlocks)
buf := make([]byte, DefaultBlockSize) buf := make([]byte, DefaultBlockSize)
for i := int64(0); i < totalBlocks; i++ { for i := int64(0); i < totalBlocks; i++ {
@ -139,7 +138,8 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
} }
if isZeroBlock(buf) { if isZeroBlock(buf) {
empty[i] = true // For a diff memfile, zero blocks mean "not dirtied since resume" —
// they should inherit the parent's mapping, not be zero-filled.
continue continue
} }
@ -149,11 +149,10 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
} }
} }
// Build new generation header merged with parent. // Only dirty blocks go into the diff overlay; MergeMappings preserves the
// parent's mapping for everything else.
dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize) dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize) merged := MergeMappings(parentHeader.Mapping, dirtyMappings)
diffMapping := MergeMappings(dirtyMappings, emptyMappings)
merged := MergeMappings(parentHeader.Mapping, diffMapping)
normalized := NormalizeMappings(merged) normalized := NormalizeMappings(merged)
metadata := parentHeader.Metadata.NextGeneration(buildID) metadata := parentHeader.Metadata.NextGeneration(buildID)

View File

@ -122,10 +122,11 @@ func (c *fcClient) resumeVM(ctx context.Context) error {
}) })
} }
// createSnapshot creates a full VM snapshot. // createSnapshot creates a VM snapshot.
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string) error { // snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{ return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{
"snapshot_type": "Full", "snapshot_type": snapshotType,
"snapshot_path": snapPath, "snapshot_path": snapPath,
"mem_file_path": memPath, "mem_file_path": memPath,
}) })

View File

@ -164,19 +164,19 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
return nil return nil
} }
// Snapshot creates a full VM snapshot. The VM must already be paused. // Snapshot creates a VM snapshot. The VM must already be paused.
// Produces a snapfile (VM state) and a memfile (full memory dump). // snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath string) error { func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
vm, ok := m.vms[sandboxID] vm, ok := m.vms[sandboxID]
if !ok { if !ok {
return fmt.Errorf("VM not found: %s", sandboxID) return fmt.Errorf("VM not found: %s", sandboxID)
} }
if err := vm.client.createSnapshot(ctx, snapPath, memPath); err != nil { if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
return fmt.Errorf("create snapshot: %w", err) return fmt.Errorf("create snapshot: %w", err)
} }
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath) slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
return nil return nil
} }