Add diff snapshots for re-pause to avoid UFFD fault-in storm
Use Firecracker's Diff snapshot type when re-pausing a previously
resumed sandbox, capturing only dirty pages instead of a full memory
dump. Chains up to 10 incremental generations before collapsing back
to a Full snapshot. Multi-generation diff files (memfile.{buildID})
are supported alongside the legacy single-file format in resume,
template creation, and snapshot existence checks.
This commit is contained in:
@ -50,8 +50,24 @@ type sandboxState struct {
|
||||
uffdSocketPath string // non-empty for sandboxes restored from snapshot
|
||||
dmDevice *devicemapper.SnapshotDevice
|
||||
baseImagePath string // path to the base template rootfs (for loop registry release)
|
||||
|
||||
// parent holds the snapshot header and diff file paths from which this
|
||||
// sandbox was restored. Non-nil means re-pause should use "Diff" snapshot
|
||||
// type instead of "Full", avoiding the UFFD fault-in storm.
|
||||
parent *snapshotParent
|
||||
}
|
||||
|
||||
// snapshotParent records the snapshot a sandbox was restored from. Pause hands
|
||||
// header to ProcessMemfileWithParent and copies every diffPaths entry into the new snapshot directory when it takes a Diff snapshot.
|
||||
type snapshotParent struct {
|
||||
header *snapshot.Header // header of the snapshot this sandbox was restored from
|
||||
diffPaths map[string]string // build ID (UUID string) → path of that build's memory diff file
|
||||
}
|
||||
|
||||
// maxDiffGenerations caps the length of an incremental diff chain: Pause takes
|
||||
// a "Diff" snapshot only while the parent header's Metadata.Generation is below this value, and a "Full" snapshot otherwise.
|
||||
const maxDiffGenerations = 10
|
||||
|
||||
// New creates a new sandbox manager.
|
||||
func New(cfg Config) *Manager {
|
||||
if cfg.EnvdTimeout == 0 {
|
||||
@ -281,7 +297,14 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
}
|
||||
slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
|
||||
|
||||
// Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately.
|
||||
// Determine snapshot type: Diff if resumed from snapshot (avoids UFFD
|
||||
// fault-in storm), Full otherwise or if generation cap is reached.
|
||||
snapshotType := "Full"
|
||||
if sb.parent != nil && sb.parent.header.Metadata.Generation < maxDiffGenerations {
|
||||
snapshotType = "Diff"
|
||||
}
|
||||
|
||||
// Step 2: Take VM state snapshot (snapfile + memfile).
|
||||
if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
|
||||
return fmt.Errorf("create snapshot dir: %w", err)
|
||||
}
|
||||
@ -290,28 +313,47 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
rawMemPath := filepath.Join(snapDir, "memfile.raw")
|
||||
snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID)
|
||||
|
||||
// For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to
|
||||
// serialize memory — this is the main bottleneck on re-pause.
|
||||
snapshotStart := time.Now()
|
||||
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil {
|
||||
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
|
||||
return fmt.Errorf("create VM snapshot: %w", err)
|
||||
}
|
||||
slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart))
|
||||
slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart))
|
||||
|
||||
// Step 3: Process the raw memfile into a compact diff + header.
|
||||
buildID := uuid.New()
|
||||
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
|
||||
headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID)
|
||||
|
||||
processStart := time.Now()
|
||||
if sb.parent != nil && snapshotType == "Diff" {
|
||||
// Diff: process against parent header, producing only changed blocks.
|
||||
diffPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, buildID)
|
||||
if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
|
||||
return fmt.Errorf("process memfile with parent: %w", err)
|
||||
}
|
||||
|
||||
// Copy previous generation diff files into the snapshot directory.
|
||||
for prevBuildID, prevPath := range sb.parent.diffPaths {
|
||||
dstPath := snapshot.MemDiffPathForBuild(m.cfg.SnapshotsDir, sandboxID, uuid.MustParse(prevBuildID))
|
||||
if prevPath != dstPath {
|
||||
if err := copyFile(prevPath, dstPath); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
|
||||
return fmt.Errorf("copy parent diff file: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Full: first generation or generation cap reached — single diff file.
|
||||
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
|
||||
if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
|
||||
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
|
||||
return fmt.Errorf("process memfile: %w", err)
|
||||
}
|
||||
slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart))
|
||||
}
|
||||
slog.Debug("pause: memfile processed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(processStart))
|
||||
|
||||
// Remove the raw memfile — we only keep the compact diff.
|
||||
// Remove the raw memfile — we only keep the compact diff(s).
|
||||
os.Remove(rawMemPath)
|
||||
|
||||
// Step 4: Destroy the VM first so Firecracker releases the dm device.
|
||||
@ -357,7 +399,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
delete(m.boxes, sandboxID)
|
||||
m.mu.Unlock()
|
||||
|
||||
slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart))
|
||||
slog.Info("sandbox paused", "id", sandboxID, "snapshot_type", snapshotType, "total_elapsed", time.Since(pauseStart))
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -380,9 +422,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
|
||||
return nil, fmt.Errorf("deserialize header: %w", err)
|
||||
}
|
||||
|
||||
// Build diff file map (build ID → file path).
|
||||
diffPaths := map[string]string{
|
||||
header.Metadata.BuildID.String(): snapshot.MemDiffPath(snapDir, sandboxID),
|
||||
// Build diff file map — supports both single-generation and multi-generation.
|
||||
diffPaths, err := snapshot.ListDiffFiles(snapDir, sandboxID, header)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list diff files: %w", err)
|
||||
}
|
||||
|
||||
source, err := uffd.NewDiffFileSource(header, diffPaths)
|
||||
@ -537,20 +580,26 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
|
||||
uffdSocketPath: uffdSocketPath,
|
||||
dmDevice: dmDev,
|
||||
baseImagePath: baseImagePath,
|
||||
// Preserve parent snapshot info so re-pause can use Diff snapshots.
|
||||
parent: &snapshotParent{
|
||||
header: header,
|
||||
diffPaths: diffPaths,
|
||||
},
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
m.boxes[sandboxID] = sb
|
||||
m.mu.Unlock()
|
||||
|
||||
// Clean up remaining snapshot files (snapfile, memfile, header, meta).
|
||||
// The CoW file was already moved out.
|
||||
warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID))
|
||||
// Don't delete snapshot dir — diff files are needed for re-pause.
|
||||
// The CoW file was already moved out. The dir will be cleaned up
|
||||
// on destroy or overwritten on re-pause.
|
||||
|
||||
slog.Info("sandbox resumed from snapshot",
|
||||
"id", sandboxID,
|
||||
"host_ip", slot.HostIP.String(),
|
||||
"dm_device", dmDev.DevicePath,
|
||||
"generation", header.Metadata.Generation,
|
||||
)
|
||||
|
||||
return &sb.Sandbox, nil
|
||||
@ -585,11 +634,11 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
|
||||
return 0, fmt.Errorf("create template dir: %w", err)
|
||||
}
|
||||
|
||||
// Copy VM snapshot and memory files.
|
||||
// Copy VM snapshot file and memory header.
|
||||
srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
|
||||
dstDir := snapshot.DirPath(m.cfg.ImagesDir, name)
|
||||
|
||||
for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} {
|
||||
for _, fname := range []string{snapshot.SnapFileName, snapshot.MemHeaderName} {
|
||||
src := filepath.Join(srcDir, fname)
|
||||
dst := filepath.Join(dstDir, fname)
|
||||
if err := copyFile(src, dst); err != nil {
|
||||
@ -598,6 +647,30 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
|
||||
}
|
||||
}
|
||||
|
||||
// Copy all memory diff files referenced by the header (supports multi-generation).
|
||||
headerData, err := os.ReadFile(filepath.Join(srcDir, snapshot.MemHeaderName))
|
||||
if err != nil {
|
||||
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
|
||||
return 0, fmt.Errorf("read header for template: %w", err)
|
||||
}
|
||||
srcHeader, err := snapshot.Deserialize(headerData)
|
||||
if err != nil {
|
||||
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
|
||||
return 0, fmt.Errorf("deserialize header for template: %w", err)
|
||||
}
|
||||
srcDiffPaths, err := snapshot.ListDiffFiles(m.cfg.SnapshotsDir, sandboxID, srcHeader)
|
||||
if err != nil {
|
||||
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
|
||||
return 0, fmt.Errorf("list diff files for template: %w", err)
|
||||
}
|
||||
for _, srcPath := range srcDiffPaths {
|
||||
dstPath := filepath.Join(dstDir, filepath.Base(srcPath))
|
||||
if err := copyFile(srcPath, dstPath); err != nil {
|
||||
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
|
||||
return 0, fmt.Errorf("copy diff file %s: %w", filepath.Base(srcPath), err)
|
||||
}
|
||||
}
|
||||
|
||||
// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
|
||||
meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID)
|
||||
if err != nil {
|
||||
@ -683,9 +756,10 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
|
||||
// Snapshot determines memory size.
|
||||
memoryMB := int(header.Metadata.Size / (1024 * 1024))
|
||||
|
||||
// Build diff file map.
|
||||
diffPaths := map[string]string{
|
||||
header.Metadata.BuildID.String(): snapshot.MemDiffPath(imagesDir, snapshotName),
|
||||
// Build diff file map — supports multi-generation templates.
|
||||
diffPaths, err := snapshot.ListDiffFiles(imagesDir, snapshotName, header)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list diff files: %w", err)
|
||||
}
|
||||
|
||||
source, err := uffd.NewDiffFileSource(header, diffPaths)
|
||||
@ -815,6 +889,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
|
||||
uffdSocketPath: uffdSocketPath,
|
||||
dmDevice: dmDev,
|
||||
baseImagePath: baseRootfs,
|
||||
// Template-spawned sandboxes also get diff re-pause support.
|
||||
parent: &snapshotParent{
|
||||
header: header,
|
||||
diffPaths: diffPaths,
|
||||
},
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
|
||||
@ -7,6 +7,8 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -28,11 +30,17 @@ func SnapPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), SnapFileName)
|
||||
}
|
||||
|
||||
// MemDiffPath returns the path to the compact memory diff file.
|
||||
// MemDiffPath returns the path to the compact memory diff file (legacy single-generation).
|
||||
func MemDiffPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), MemDiffName)
|
||||
}
|
||||
|
||||
// MemDiffPathForBuild returns the path to a specific generation's diff file.
|
||||
// Format: memfile.{buildID}
|
||||
func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string {
|
||||
return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String()))
|
||||
}
|
||||
|
||||
// MemHeaderPath returns the path to the memory mapping header file.
|
||||
func MemHeaderPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), MemHeaderName)
|
||||
@ -85,17 +93,38 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
|
||||
|
||||
// Exists reports whether a complete snapshot exists (all required files present).
|
||||
// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
|
||||
// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}".
|
||||
func Exists(baseDir, name string) bool {
|
||||
dir := DirPath(baseDir, name)
|
||||
|
||||
// Common files required by both formats.
|
||||
for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} {
|
||||
// snapfile and header are always required.
|
||||
for _, f := range []string{SnapFileName, MemHeaderName} {
|
||||
if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot).
|
||||
// Check that at least one memfile exists (legacy or generation-specific).
|
||||
// We verify by reading the header and checking that referenced diff files exist.
|
||||
// Fall back to checking for the legacy memfile name if header can't be read.
|
||||
if _, err := os.Stat(filepath.Join(dir, MemDiffName)); err != nil {
|
||||
// No legacy memfile — check if any memfile.{uuid} exists by
|
||||
// looking for files matching the pattern.
|
||||
matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*"))
|
||||
hasGenDiff := false
|
||||
for _, m := range matches {
|
||||
base := filepath.Base(m)
|
||||
if base != MemHeaderName {
|
||||
hasGenDiff = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasGenDiff {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Accept either rootfs.ext4 (legacy/template) or rootfs.cow + rootfs.meta (dm-snapshot).
|
||||
if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil {
|
||||
return true
|
||||
}
|
||||
@ -127,6 +156,37 @@ func HasCow(baseDir, name string) bool {
|
||||
return cowErr == nil && metaErr == nil
|
||||
}
|
||||
|
||||
// ListDiffFiles returns a map of build ID → file path for all memory diff files
|
||||
// referenced by the given header. Handles both the legacy "memfile" name
|
||||
// (single-generation) and generation-specific "memfile.{uuid}" names.
|
||||
func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) {
|
||||
dir := DirPath(baseDir, name)
|
||||
result := make(map[string]string)
|
||||
|
||||
for _, m := range header.Mapping {
|
||||
if m.BuildID == uuid.Nil {
|
||||
continue // zero-fill, no file needed
|
||||
}
|
||||
idStr := m.BuildID.String()
|
||||
if _, exists := result[idStr]; exists {
|
||||
continue
|
||||
}
|
||||
// Try generation-specific path first, fall back to legacy.
|
||||
genPath := filepath.Join(dir, fmt.Sprintf("memfile.%s", idStr))
|
||||
if _, err := os.Stat(genPath); err == nil {
|
||||
result[idStr] = genPath
|
||||
continue
|
||||
}
|
||||
legacyPath := filepath.Join(dir, MemDiffName)
|
||||
if _, err := os.Stat(legacyPath); err == nil {
|
||||
result[idStr] = legacyPath
|
||||
continue
|
||||
}
|
||||
return nil, fmt.Errorf("diff file not found for build %s", idStr)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// EnsureDir creates the snapshot directory if it doesn't exist.
|
||||
func EnsureDir(baseDir, name string) error {
|
||||
dir := DirPath(baseDir, name)
|
||||
|
||||
@ -123,7 +123,6 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
|
||||
|
||||
totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
|
||||
dirty := make([]bool, totalBlocks)
|
||||
empty := make([]bool, totalBlocks)
|
||||
buf := make([]byte, DefaultBlockSize)
|
||||
|
||||
for i := int64(0); i < totalBlocks; i++ {
|
||||
@ -139,7 +138,8 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
|
||||
}
|
||||
|
||||
if isZeroBlock(buf) {
|
||||
empty[i] = true
|
||||
// For a diff memfile, zero blocks mean "not dirtied since resume" —
|
||||
// they should inherit the parent's mapping, not be zero-filled.
|
||||
continue
|
||||
}
|
||||
|
||||
@ -149,11 +149,10 @@ func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHe
|
||||
}
|
||||
}
|
||||
|
||||
// Build new generation header merged with parent.
|
||||
// Only dirty blocks go into the diff overlay; MergeMappings preserves the
|
||||
// parent's mapping for everything else.
|
||||
dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
|
||||
emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
|
||||
diffMapping := MergeMappings(dirtyMappings, emptyMappings)
|
||||
merged := MergeMappings(parentHeader.Mapping, diffMapping)
|
||||
merged := MergeMappings(parentHeader.Mapping, dirtyMappings)
|
||||
normalized := NormalizeMappings(merged)
|
||||
|
||||
metadata := parentHeader.Metadata.NextGeneration(buildID)
|
||||
|
||||
@ -122,10 +122,11 @@ func (c *fcClient) resumeVM(ctx context.Context) error {
|
||||
})
|
||||
}
|
||||
|
||||
// createSnapshot creates a full VM snapshot.
|
||||
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string) error {
|
||||
// createSnapshot creates a VM snapshot.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
|
||||
return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{
|
||||
"snapshot_type": "Full",
|
||||
"snapshot_type": snapshotType,
|
||||
"snapshot_path": snapPath,
|
||||
"mem_file_path": memPath,
|
||||
})
|
||||
|
||||
@ -164,19 +164,19 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Snapshot creates a full VM snapshot. The VM must already be paused.
|
||||
// Produces a snapfile (VM state) and a memfile (full memory dump).
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath string) error {
|
||||
// Snapshot creates a VM snapshot. The VM must already be paused.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
|
||||
vm, ok := m.vms[sandboxID]
|
||||
if !ok {
|
||||
return fmt.Errorf("VM not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
if err := vm.client.createSnapshot(ctx, snapPath, memPath); err != nil {
|
||||
if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
|
||||
return fmt.Errorf("create snapshot: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath)
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user