From 80a99eec87134e0e94570fa407be0dde53c852e8 Mon Sep 17 00:00:00 2001 From: pptx704 Date: Fri, 13 Mar 2026 09:37:54 +0600 Subject: [PATCH] Add diff snapshots for re-pause to avoid UFFD fault-in storm Use Firecracker's Diff snapshot type when re-pausing a previously resumed sandbox, capturing only dirty pages instead of a full memory dump. Chains up to 10 incremental generations before collapsing back to a Full snapshot. Multi-generation diff files (memfile.{buildID}) are supported alongside the legacy single-file format in resume, template creation, and snapshot existence checks. --- internal/sandbox/manager.go | 127 ++++++++++++++++++++++++++++------- internal/snapshot/local.go | 68 +++++++++++++++++-- internal/snapshot/memfile.go | 11 ++- internal/vm/fc.go | 7 +- internal/vm/manager.go | 10 +-- 5 files changed, 181 insertions(+), 42 deletions(-) diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index 5fd86d8..c030653 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -50,8 +50,24 @@ type sandboxState struct { uffdSocketPath string // non-empty for sandboxes restored from snapshot dmDevice *devicemapper.SnapshotDevice baseImagePath string // path to the base template rootfs (for loop registry release) + + // parent holds the snapshot header and diff file paths from which this + // sandbox was restored. Non-nil means re-pause should use "Diff" snapshot + // type instead of "Full", avoiding the UFFD fault-in storm. + parent *snapshotParent } +// snapshotParent stores the previous generation's snapshot state so that +// re-pause can produce an incremental diff instead of a full memory dump. +type snapshotParent struct { + header *snapshot.Header + diffPaths map[string]string // build ID → file path +} + +// maxDiffGenerations caps how many incremental diff generations we chain +// before falling back to a Full snapshot to collapse the chain. +const maxDiffGenerations = 10 + // New creates a new sandbox manager. 
func New(cfg Config) *Manager { if cfg.EnvdTimeout == 0 { @@ -281,7 +297,14 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { } slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart)) - // Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately. + // Determine snapshot type: Diff if resumed from snapshot (avoids UFFD + // fault-in storm), Full otherwise or if generation cap is reached. + snapshotType := "Full" + if sb.parent != nil && sb.parent.header.Metadata.Generation < maxDiffGenerations { + snapshotType = "Diff" + } + + // Step 2: Take VM state snapshot (snapfile + memfile). if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil { return fmt.Errorf("create snapshot dir: %w", err) } @@ -290,28 +313,47 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { rawMemPath := filepath.Join(snapDir, "memfile.raw") snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID) - // For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to - // serialize memory — this is the main bottleneck on re-pause. snapshotStart := time.Now() - if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil { + if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil { warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) return fmt.Errorf("create VM snapshot: %w", err) } - slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart)) + slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart)) // Step 3: Process the raw memfile into a compact diff + header. 
buildID := uuid.New() - diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID) headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID) processStart := time.Now() - if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { - warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) - return fmt.Errorf("process memfile: %w", err) - } - slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart)) + if sb.parent != nil && snapshotType == "Diff" { + // Diff: process against parent header, producing only changed blocks. + diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID) + if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil { + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + return fmt.Errorf("process memfile with parent: %w", err) + } - // Remove the raw memfile — we only keep the compact diff. + // Copy previous generation diff files into the snapshot directory. + for prevBuildID, prevPath := range sb.parent.diffPaths { + dstPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, uuid.MustParse(prevBuildID)) + if prevPath != dstPath { + if err := copyFile(prevPath, dstPath); err != nil { + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + return fmt.Errorf("copy parent diff file: %w", err) + } + } + } + } else { + // Full: first generation or generation cap reached — single diff file. 
+ diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID) + if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + return fmt.Errorf("process memfile: %w", err) + } + } + slog.Debug("pause: memfile processed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(processStart)) + + // Remove the raw memfile — we only keep the compact diff(s). os.Remove(rawMemPath) // Step 4: Destroy the VM first so Firecracker releases the dm device. @@ -357,7 +399,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { delete(m.boxes, sandboxID) m.mu.Unlock() - slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart)) + slog.Info("sandbox paused", "id", sandboxID, "snapshot_type", snapshotType, "total_elapsed", time.Since(pauseStart)) return nil } @@ -380,9 +422,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox return nil, fmt.Errorf("deserialize header: %w", err) } - // Build diff file map (build ID → file path). - diffPaths := map[string]string{ - header.Metadata.BuildID.String(): snapshot.MemDiffPath(snapDir, sandboxID), + // Build diff file map — supports both single-generation and multi-generation. + diffPaths, err := snapshot.ListDiffFiles(snapDir, sandboxID, header) + if err != nil { + return nil, fmt.Errorf("list diff files: %w", err) } source, err := uffd.NewDiffFileSource(header, diffPaths) @@ -537,20 +580,26 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox uffdSocketPath: uffdSocketPath, dmDevice: dmDev, baseImagePath: baseImagePath, + // Preserve parent snapshot info so re-pause can use Diff snapshots. + parent: &snapshotParent{ + header: header, + diffPaths: diffPaths, + }, } m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() - // Clean up remaining snapshot files (snapfile, memfile, header, meta). 
- // The CoW file was already moved out. - warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID)) + // Don't delete snapshot dir — diff files are needed for re-pause. + // The CoW file was already moved out. The dir will be cleaned up + // on destroy or overwritten on re-pause. slog.Info("sandbox resumed from snapshot", "id", sandboxID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, + "generation", header.Metadata.Generation, ) return &sb.Sandbox, nil @@ -585,11 +634,11 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i return 0, fmt.Errorf("create template dir: %w", err) } - // Copy VM snapshot and memory files. + // Copy VM snapshot file and memory header. srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID) dstDir := snapshot.DirPath(m.cfg.ImagesDir, name) - for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} { + for _, fname := range []string{snapshot.SnapFileName, snapshot.MemHeaderName} { src := filepath.Join(srcDir, fname) dst := filepath.Join(dstDir, fname) if err := copyFile(src, dst); err != nil { @@ -598,6 +647,30 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i } } + // Copy all memory diff files referenced by the header (supports multi-generation). 
+ headerData, err := os.ReadFile(filepath.Join(srcDir, snapshot.MemHeaderName)) + if err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("read header for template: %w", err) + } + srcHeader, err := snapshot.Deserialize(headerData) + if err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("deserialize header for template: %w", err) + } + srcDiffPaths, err := snapshot.ListDiffFiles(m.cfg.SnapshotsDir, sandboxID, srcHeader) + if err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("list diff files for template: %w", err) + } + for _, srcPath := range srcDiffPaths { + dstPath := filepath.Join(dstDir, filepath.Base(srcPath)) + if err := copyFile(srcPath, dstPath); err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("copy diff file %s: %w", filepath.Base(srcPath), err) + } + } + // Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image. meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID) if err != nil { @@ -683,9 +756,10 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam // Snapshot determines memory size. memoryMB := int(header.Metadata.Size / (1024 * 1024)) - // Build diff file map. - diffPaths := map[string]string{ - header.Metadata.BuildID.String(): snapshot.MemDiffPath(imagesDir, snapshotName), + // Build diff file map — supports multi-generation templates. 
+ diffPaths, err := snapshot.ListDiffFiles(imagesDir, snapshotName, header) + if err != nil { + return nil, fmt.Errorf("list diff files: %w", err) } source, err := uffd.NewDiffFileSource(header, diffPaths) @@ -815,6 +889,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam uffdSocketPath: uffdSocketPath, dmDevice: dmDev, baseImagePath: baseRootfs, + // Template-spawned sandboxes also get diff re-pause support. + parent: &snapshotParent{ + header: header, + diffPaths: diffPaths, + }, } m.mu.Lock() diff --git a/internal/snapshot/local.go b/internal/snapshot/local.go index 7a7e750..8e667b8 100644 --- a/internal/snapshot/local.go +++ b/internal/snapshot/local.go @@ -7,6 +7,8 @@ import ( "os" "path/filepath" "syscall" + + "github.com/google/uuid" ) const ( @@ -28,11 +30,17 @@ func SnapPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), SnapFileName) } -// MemDiffPath returns the path to the compact memory diff file. +// MemDiffPath returns the path to the compact memory diff file (legacy single-generation). func MemDiffPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), MemDiffName) } +// MemDiffPathForBuild returns the path to a specific generation's diff file. +// Format: memfile.{buildID} +func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string { + return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String())) +} + // MemHeaderPath returns the path to the memory mapping header file. func MemHeaderPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), MemHeaderName) @@ -85,17 +93,38 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) { // Exists reports whether a complete snapshot exists (all required files present). // Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots. +// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}". 
func Exists(baseDir, name string) bool { dir := DirPath(baseDir, name) - // Common files required by both formats. - for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} { + // snapfile and header are always required. + for _, f := range []string{SnapFileName, MemHeaderName} { if _, err := os.Stat(filepath.Join(dir, f)); err != nil { return false } } - // Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot). + // Check that at least one memfile exists (legacy or generation-specific). + // This is a presence check only: the header is not read here, so it does not + // confirm that every diff file the header references is actually present. + if _, err := os.Stat(filepath.Join(dir, MemDiffName)); err != nil { + // No legacy memfile — check if any memfile.{uuid} exists by + // looking for files matching the pattern. + matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*")) + hasGenDiff := false + for _, m := range matches { + base := filepath.Base(m) + if base != MemHeaderName { + hasGenDiff = true + break + } + } + if !hasGenDiff { + return false + } + } + + // Accept either rootfs.ext4 (legacy/template) or rootfs.cow + rootfs.meta (dm-snapshot). if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil { return true } @@ -127,6 +156,37 @@ func HasCow(baseDir, name string) bool { return cowErr == nil && metaErr == nil } +// ListDiffFiles returns a map of build ID → file path for all memory diff files +// referenced by the given header. Handles both the legacy "memfile" name +// (single-generation) and generation-specific "memfile.{uuid}" names. 
+func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) { + dir := DirPath(baseDir, name) + result := make(map[string]string) + + for _, m := range header.Mapping { + if m.BuildID == uuid.Nil { + continue // zero-fill, no file needed + } + idStr := m.BuildID.String() + if _, exists := result[idStr]; exists { + continue + } + // Try generation-specific path first, fall back to legacy. + genPath := filepath.Join(dir, fmt.Sprintf("memfile.%s", idStr)) + if _, err := os.Stat(genPath); err == nil { + result[idStr] = genPath + continue + } + legacyPath := filepath.Join(dir, MemDiffName) + if _, err := os.Stat(legacyPath); err == nil { + result[idStr] = legacyPath + continue + } + return nil, fmt.Errorf("diff file not found for build %s", idStr) + } + return result, nil +} + // EnsureDir creates the snapshot directory if it doesn't exist. func EnsureDir(baseDir, name string) error { dir := DirPath(baseDir, name) diff --git a/internal/snapshot/memfile.go b/internal/snapshot/memfile.go index a0bbc95..aabe885 100644 --- a/internal/snapshot/memfile.go +++ b/internal/snapshot/memfile.go @@ -123,7 +123,6 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe totalBlocks := TotalBlocks(memSize, DefaultBlockSize) dirty := make([]bool, totalBlocks) - empty := make([]bool, totalBlocks) buf := make([]byte, DefaultBlockSize) for i := int64(0); i < totalBlocks; i++ { @@ -139,7 +138,8 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe } if isZeroBlock(buf) { - empty[i] = true + // For a diff memfile, zero blocks mean "not dirtied since resume" — + // they should inherit the parent's mapping, not be zero-filled. continue } @@ -149,11 +149,10 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe } } - // Build new generation header merged with parent. + // Only dirty blocks go into the diff overlay; MergeMappings preserves the + // parent's mapping for everything else. 
dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize) - emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize) - diffMapping := MergeMappings(dirtyMappings, emptyMappings) - merged := MergeMappings(parentHeader.Mapping, diffMapping) + merged := MergeMappings(parentHeader.Mapping, dirtyMappings) normalized := NormalizeMappings(merged) metadata := parentHeader.Metadata.NextGeneration(buildID) diff --git a/internal/vm/fc.go b/internal/vm/fc.go index fccde2f..b5af5db 100644 --- a/internal/vm/fc.go +++ b/internal/vm/fc.go @@ -122,10 +122,11 @@ func (c *fcClient) resumeVM(ctx context.Context) error { }) } -// createSnapshot creates a full VM snapshot. -func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string) error { +// createSnapshot creates a VM snapshot. +// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume). +func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error { return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{ - "snapshot_type": "Full", + "snapshot_type": snapshotType, "snapshot_path": snapPath, "mem_file_path": memPath, }) diff --git a/internal/vm/manager.go b/internal/vm/manager.go index 5e107ea..b68bde1 100644 --- a/internal/vm/manager.go +++ b/internal/vm/manager.go @@ -164,19 +164,19 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error { return nil } -// Snapshot creates a full VM snapshot. The VM must already be paused. -// Produces a snapfile (VM state) and a memfile (full memory dump). -func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath string) error { +// Snapshot creates a VM snapshot. The VM must already be paused. +// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume). 
+func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error { vm, ok := m.vms[sandboxID] if !ok { return fmt.Errorf("VM not found: %s", sandboxID) } - if err := vm.client.createSnapshot(ctx, snapPath, memPath); err != nil { + if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil { return fmt.Errorf("create snapshot: %w", err) } - slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath) + slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType) return nil }