Add sandbox snapshot and restore with UFFD lazy memory loading

Implement full snapshot lifecycle: pause (snapshot + free resources),
resume (UFFD-based lazy restore), and named snapshot templates that
can spawn new sandboxes from frozen VM state.

Key changes:
- Snapshot header system with generational diff mapping (inspired by e2b)
- UFFD server for lazy page fault handling during snapshot restore
- Stable rootfs symlink path (/tmp/fc-vm/) for snapshot compatibility
- Templates DB table and CRUD API endpoints (POST/GET/DELETE /v1/snapshots)
- CreateSnapshot/DeleteSnapshot RPCs in hostagent proto
- Reconciler excludes paused sandboxes (expected absent from host agent)
- Snapshot templates lock vcpus/memory to baked-in values
- Proper cleanup of uffd sockets and pause snapshot files on destroy
This commit is contained in:
2026-03-12 09:19:37 +06:00
parent 9b94df7f56
commit a1bd439c75
33 changed files with 2714 additions and 166 deletions

220
internal/snapshot/header.go Normal file
View File

@ -0,0 +1,220 @@
// Package snapshot implements snapshot storage, header-based memory mapping,
// and memory file processing for Firecracker VM snapshots.
//
// The header system implements a generational copy-on-write memory mapping.
// Each snapshot generation stores only the blocks that changed since the
// previous generation. A Header contains a sorted list of BuildMap entries
// that together cover the entire memory address space, with each entry
// pointing to a specific generation's diff file.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
package snapshot
import (
"bytes"
"context"
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/google/uuid"
)
// metadataVersion is the header format version that NewMetadata stamps into
// Metadata.Version.
const metadataVersion = 1
// Metadata is the fixed-size header prefix describing the snapshot memory layout.
// Serialize writes the struct with encoding/binary, so the declaration order
// and the field types below define the on-disk format.
// Binary layout (little-endian, 64 bytes total):
//
// Version uint64 (8 bytes) — header format version
// BlockSize uint64 (8 bytes) — mapping granularity in bytes
// Size uint64 (8 bytes) — total memory size in bytes
// Generation uint64 (8 bytes) — 0 for the first snapshot, +1 per NextGeneration
// BuildID [16]byte (UUID) — build that produced this generation
// BaseBuildID [16]byte (UUID) — build ID of generation 0 of the chain
type Metadata struct {
Version uint64
BlockSize uint64
Size uint64
Generation uint64
BuildID uuid.UUID
BaseBuildID uuid.UUID
}
// NewMetadata builds the metadata for generation 0 of a snapshot chain.
// The returned value carries the current metadataVersion, the given block
// size and total memory size, and uses buildID both as its own build ID and
// as the base build ID of the chain.
func NewMetadata(buildID uuid.UUID, blockSize, size uint64) *Metadata {
	meta := new(Metadata)
	meta.Version = metadataVersion
	meta.BlockSize = blockSize
	meta.Size = size
	meta.BuildID = buildID
	// The first generation is its own base; Generation keeps its zero value.
	meta.BaseBuildID = buildID
	return meta
}
// NextGeneration derives the metadata of the following generation in the
// chain: Version, BlockSize, Size and BaseBuildID are inherited from m,
// Generation is incremented by one, and BuildID is set to buildID.
// The receiver is not modified.
func (m *Metadata) NextGeneration(buildID uuid.UUID) *Metadata {
	next := *m // copies every inherited field in one go
	next.Generation = m.Generation + 1
	next.BuildID = buildID
	return &next
}
// BuildMap maps a contiguous range of the memory address space to a specific
// generation's diff file. Serialize writes the struct with encoding/binary, so
// the declaration order and field types below define the on-disk record.
// Binary layout (little-endian, 40 bytes):
//
// Offset uint64 — byte offset in the virtual address space
// Length uint64 — byte count (multiple of BlockSize)
// BuildID [16]byte — which generation's diff file, uuid.Nil = zero-fill
// BuildStorageOffset uint64 — byte offset within that generation's diff file
type BuildMap struct {
Offset uint64
Length uint64
BuildID uuid.UUID
BuildStorageOffset uint64
}
// Header is the in-memory representation of a snapshot's memory mapping.
// GetShiftedMapping uses it to resolve a memory offset to the generation's
// diff file and the offset within that file.
type Header struct {
Metadata *Metadata
Mapping []*BuildMap
// blockStarts[i] is true when a BuildMap entry starts in block i.
// startMap provides direct access from such a block index to the BuildMap.
// Both are derived from Mapping by NewHeader.
blockStarts []bool
startMap map[int64]*BuildMap
}
// NewHeader assembles a Header from metadata and its mapping entries and
// builds the block-start index used for lookups.
//
// An empty or nil mapping is replaced by one entry that spans the whole
// memory size and points at offset 0 of metadata.BuildID's diff file.
// Entries whose starting block lies outside [0, total blocks) are kept in
// Mapping but are not indexed. An error is returned when the block size is
// zero.
func NewHeader(metadata *Metadata, mapping []*BuildMap) (*Header, error) {
	blockSize := int64(metadata.BlockSize)
	if blockSize == 0 {
		return nil, fmt.Errorf("block size cannot be zero")
	}

	if len(mapping) == 0 {
		whole := &BuildMap{
			Length:  metadata.Size,
			BuildID: metadata.BuildID,
		}
		mapping = []*BuildMap{whole}
	}

	total := TotalBlocks(int64(metadata.Size), blockSize)
	h := &Header{
		Metadata:    metadata,
		Mapping:     mapping,
		blockStarts: make([]bool, total),
		startMap:    make(map[int64]*BuildMap, len(mapping)),
	}
	for _, entry := range mapping {
		first := BlockIdx(int64(entry.Offset), blockSize)
		if first < 0 || first >= total {
			continue
		}
		h.blockStarts[first] = true
		h.startMap[first] = entry
	}
	return h, nil
}
// GetShiftedMapping resolves a memory offset to the corresponding diff file
// offset, remaining length, and build ID. This is the hot path called for
// every UFFD page fault, so the entry is located with a binary search over
// h.Mapping (O(log N) in the number of entries) instead of scanning block by
// block.
//
// h.Mapping must be sorted by Offset. Resolution works at block granularity:
// mappedOffset is the storage offset of the start of the block containing
// offset, and mappedLength is the number of bytes from that block start to
// the end of the mapping entry. buildID points into the mapping entry and
// must not be modified by the caller.
//
// An error is returned when offset is outside [0, Metadata.Size), when no
// entry starts at or before the requested block, or when the block lies past
// the end of the closest preceding entry (a gap in the mapping).
func (h *Header) GetShiftedMapping(_ context.Context, offset int64) (mappedOffset int64, mappedLength int64, buildID *uuid.UUID, err error) {
	if offset < 0 || offset >= int64(h.Metadata.Size) {
		return 0, 0, nil, fmt.Errorf("offset %d out of bounds (size: %d)", offset, h.Metadata.Size)
	}

	blockSize := int64(h.Metadata.BlockSize)
	block := BlockIdx(offset, blockSize)

	// Find the first entry that starts in a block after the requested one;
	// the entry just before it is the last one starting at or before block.
	lo, hi := 0, len(h.Mapping)
	for lo < hi {
		mid := int(uint(lo+hi) >> 1)
		if BlockIdx(int64(h.Mapping[mid].Offset), blockSize) > block {
			hi = mid
		} else {
			lo = mid + 1
		}
	}
	if lo == 0 {
		return 0, 0, nil, fmt.Errorf("no mapping found for offset %d", offset)
	}

	m := h.Mapping[lo-1]
	start := BlockIdx(int64(m.Offset), blockSize)
	shift := (block - start) * blockSize
	if shift >= int64(m.Length) {
		return 0, 0, nil, fmt.Errorf("offset %d beyond mapping end (mapping offset=%d, length=%d)", offset, m.Offset, m.Length)
	}
	return int64(m.BuildStorageOffset) + shift, int64(m.Length) - shift, &m.BuildID, nil
}
// Serialize encodes metadata followed by every mapping entry, in slice order,
// as fixed-size little-endian records and returns the resulting bytes.
// Any encoding failure is wrapped and returned.
func Serialize(metadata *Metadata, mappings []*BuildMap) ([]byte, error) {
	out := new(bytes.Buffer)
	order := binary.LittleEndian

	if err := binary.Write(out, order, metadata); err != nil {
		return nil, fmt.Errorf("write metadata: %w", err)
	}
	for i := range mappings {
		if err := binary.Write(out, order, mappings[i]); err != nil {
			return nil, fmt.Errorf("write mapping: %w", err)
		}
	}
	return out.Bytes(), nil
}
// Deserialize decodes a header previously produced by Serialize: one
// Metadata record followed by zero or more BuildMap records, all
// little-endian. Decoding stops cleanly at the end of data; a truncated
// record is reported as an error. The decoded values are handed to NewHeader.
func Deserialize(data []byte) (*Header, error) {
	r := bytes.NewReader(data)

	meta := new(Metadata)
	if err := binary.Read(r, binary.LittleEndian, meta); err != nil {
		return nil, fmt.Errorf("read metadata: %w", err)
	}

	var entries []*BuildMap
	for {
		entry := new(BuildMap)
		err := binary.Read(r, binary.LittleEndian, entry)
		if errors.Is(err, io.EOF) {
			// No bytes left: every record has been consumed.
			break
		}
		if err != nil {
			return nil, fmt.Errorf("read mapping: %w", err)
		}
		entries = append(entries, entry)
	}
	return NewHeader(meta, entries)
}
// Block index helpers.

// TotalBlocks returns how many blocks of blockSize bytes are needed to hold
// size bytes, counting a trailing partial block as a full one.
// blockSize must be non-zero.
func TotalBlocks(size, blockSize int64) int64 {
	roundedUp := size + blockSize - 1
	return roundedUp / blockSize
}
// BlockIdx returns the index of the block that contains byte offset, using
// integer division by blockSize. blockSize must be non-zero.
func BlockIdx(offset, blockSize int64) int64 {
	idx := offset / blockSize
	return idx
}
// BlockOffset returns the byte offset at which block idx begins, i.e.
// idx multiplied by blockSize.
func BlockOffset(idx, blockSize int64) int64 {
	start := idx * blockSize
	return start
}

View File

@ -1 +1,101 @@
package snapshot
import (
"fmt"
"io/fs"
"os"
"path/filepath"
)
// File names used inside a snapshot directory. Exists requires all four to be
// present; IsTemplate only requires RootfsFileName.
const (
// SnapFileName is the VM state snapshot file.
SnapFileName = "snapfile"
// MemDiffName is the compact memory diff file.
MemDiffName = "memfile"
// MemHeaderName is the memory mapping header file.
MemHeaderName = "memfile.header"
// RootfsFileName is the rootfs image.
RootfsFileName = "rootfs.ext4"
)
// DirPath returns the directory that holds the snapshot called name,
// i.e. baseDir joined with name.
func DirPath(baseDir, name string) string {
	dir := filepath.Join(baseDir, name)
	return dir
}
// SnapPath returns the location of the VM state snapshot file
// (SnapFileName) inside the snapshot directory for name.
func SnapPath(baseDir, name string) string {
	dir := DirPath(baseDir, name)
	return filepath.Join(dir, SnapFileName)
}
// MemDiffPath returns the location of the compact memory diff file
// (MemDiffName) inside the snapshot directory for name.
func MemDiffPath(baseDir, name string) string {
	dir := DirPath(baseDir, name)
	return filepath.Join(dir, MemDiffName)
}
// MemHeaderPath returns the location of the memory mapping header file
// (MemHeaderName) inside the snapshot directory for name.
func MemHeaderPath(baseDir, name string) string {
	dir := DirPath(baseDir, name)
	return filepath.Join(dir, MemHeaderName)
}
// RootfsPath returns the location of the rootfs image (RootfsFileName)
// inside the snapshot directory for name.
func RootfsPath(baseDir, name string) string {
	dir := DirPath(baseDir, name)
	return filepath.Join(dir, RootfsFileName)
}
// Exists reports whether the snapshot directory for name contains every
// required file: the VM state file, the memory diff, the memory header and
// the rootfs image. Any stat failure, not only "file does not exist", counts
// as the file being absent.
func Exists(baseDir, name string) bool {
	dir := DirPath(baseDir, name)
	required := [...]string{SnapFileName, MemDiffName, MemHeaderName, RootfsFileName}
	for i := range required {
		_, err := os.Stat(filepath.Join(dir, required[i]))
		if err != nil {
			return false
		}
	}
	return true
}
// IsTemplate reports whether the directory for name looks like a template
// image, which only requires that its rootfs.ext4 can be stat'ed.
func IsTemplate(baseDir, name string) bool {
	rootfs := filepath.Join(DirPath(baseDir, name), RootfsFileName)
	if _, err := os.Stat(rootfs); err != nil {
		return false
	}
	return true
}
// IsSnapshot reports whether a directory is a snapshot (has all snapshot files).
// It is an alias for Exists and returns exactly what Exists returns.
func IsSnapshot(baseDir, name string) bool {
return Exists(baseDir, name)
}
// EnsureDir makes sure the snapshot directory for name exists, creating it
// and any missing parents with mode 0755. An existing directory is not an
// error.
func EnsureDir(baseDir, name string) error {
	dir := DirPath(baseDir, name)
	err := os.MkdirAll(dir, 0755)
	if err == nil {
		return nil
	}
	return fmt.Errorf("create snapshot dir %s: %w", dir, err)
}
// Remove recursively deletes the snapshot directory for name together with
// everything in it. The os.RemoveAll error, if any, is returned as is.
func Remove(baseDir, name string) error {
	dir := DirPath(baseDir, name)
	return os.RemoveAll(dir)
}
// DirSize walks the snapshot directory for name and returns the sum of the
// sizes reported for every non-directory entry. The first error hit during
// the walk aborts it and is returned wrapped.
func DirSize(baseDir, name string) (int64, error) {
	var sum int64

	visit := func(_ string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			return walkErr
		case entry.IsDir():
			// Directories contribute nothing themselves; keep descending.
			return nil
		}
		info, err := entry.Info()
		if err != nil {
			return err
		}
		sum += info.Size()
		return nil
	}

	if err := filepath.WalkDir(DirPath(baseDir, name), visit); err != nil {
		return 0, fmt.Errorf("calculate snapshot size: %w", err)
	}
	return sum, nil
}

View File

@ -0,0 +1,213 @@
package snapshot
import "github.com/google/uuid"
// CreateMapping turns a per-block flag slice into BuildMap entries sorted by
// Offset. Every maximal run of consecutive set blocks becomes one entry
// tagged with buildID. BuildStorageOffset is the running total of bytes
// covered by the preceding entries, which matches a diff file that stores
// the set blocks back to back. Unset blocks produce no entry.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func CreateMapping(buildID uuid.UUID, dirty []bool, blockSize int64) []*BuildMap {
	var (
		out     []*BuildMap
		first   int64 = -1 // first block of the open run, -1 when no run is open
		count   int64      // number of blocks in the open run
		written uint64     // bytes covered by entries emitted so far
	)

	// emit closes the open run, if any, and appends its entry.
	emit := func() {
		if count <= 0 {
			return
		}
		span := uint64(count) * uint64(blockSize)
		out = append(out, &BuildMap{
			Offset:             uint64(first) * uint64(blockSize),
			Length:             span,
			BuildID:            buildID,
			BuildStorageOffset: written,
		})
		written += span
		count = 0
	}

	for i := range dirty {
		switch {
		case !dirty[i]:
			emit()
			first = -1
		case first < 0:
			first = int64(i)
			count = 1
		default:
			count++
		}
	}
	emit()
	return out
}
// MergeMappings overlays diffMapping on top of baseMapping. Where they overlap,
// diff takes priority. The result covers the entire address space.
//
// Both inputs must be sorted by Offset. The base mapping should cover the full size.
//
// Aliasing: when diffMapping is empty, baseMapping itself is returned. Otherwise
// the result holds copies (or freshly built pieces) of base entries, but the
// diff entries are appended by pointer and are shared with diffMapping.
// Zero-length entries are skipped while both lists still have entries left.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func MergeMappings(baseMapping, diffMapping []*BuildMap) []*BuildMap {
if len(diffMapping) == 0 {
return baseMapping
}
// Work on a copy of baseMapping to avoid mutating the original.
// The loop below overwrites baseCopy[bi] with the unprocessed right-hand
// remainder of a base entry after a diff entry has been carved out of it.
baseCopy := make([]*BuildMap, len(baseMapping))
for i, m := range baseMapping {
cp := *m
baseCopy[i] = &cp
}
var result []*BuildMap
// bi and di are the cursors into baseCopy and diffMapping.
var bi, di int
for bi < len(baseCopy) && di < len(diffMapping) {
base := baseCopy[bi]
diff := diffMapping[di]
if base.Length == 0 {
bi++
continue
}
if diff.Length == 0 {
di++
continue
}
// No overlap: base entirely before diff.
if base.Offset+base.Length <= diff.Offset {
result = append(result, base)
bi++
continue
}
// No overlap: diff entirely before base.
if diff.Offset+diff.Length <= base.Offset {
result = append(result, diff)
di++
continue
}
// Base fully inside diff — skip base.
// The diff entry is emitted later, once a base entry reaches past it or
// the base list runs out.
if base.Offset >= diff.Offset && base.Offset+base.Length <= diff.Offset+diff.Length {
bi++
continue
}
// Diff fully inside base — split base around diff.
if diff.Offset >= base.Offset && diff.Offset+diff.Length <= base.Offset+base.Length {
leftLen := int64(diff.Offset) - int64(base.Offset)
if leftLen > 0 {
result = append(result, &BuildMap{
Offset: base.Offset,
Length: uint64(leftLen),
BuildID: base.BuildID,
BuildStorageOffset: base.BuildStorageOffset,
})
}
result = append(result, diff)
di++
// rightShift is how far into the base entry the diff entry ends; the
// remainder keeps the same shift in its storage offset.
rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
rightLen := int64(base.Length) - rightShift
if rightLen > 0 {
baseCopy[bi] = &BuildMap{
Offset: base.Offset + uint64(rightShift),
Length: uint64(rightLen),
BuildID: base.BuildID,
BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
}
} else {
bi++
}
continue
}
// Base starts after diff with overlap — emit diff, trim base.
if base.Offset > diff.Offset {
result = append(result, diff)
di++
rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
rightLen := int64(base.Length) - rightShift
if rightLen > 0 {
baseCopy[bi] = &BuildMap{
Offset: base.Offset + uint64(rightShift),
Length: uint64(rightLen),
BuildID: base.BuildID,
BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
}
} else {
bi++
}
continue
}
// Diff starts after base with overlap — emit left part of base.
// The diff entry stays pending and is handled against the next base entry.
if diff.Offset > base.Offset {
leftLen := int64(diff.Offset) - int64(base.Offset)
if leftLen > 0 {
result = append(result, &BuildMap{
Offset: base.Offset,
Length: uint64(leftLen),
BuildID: base.BuildID,
BuildStorageOffset: base.BuildStorageOffset,
})
}
bi++
continue
}
}
// Append remaining entries.
// The loop ends when one list is exhausted, so at most one of these two
// appends adds anything.
result = append(result, baseCopy[bi:]...)
result = append(result, diffMapping[di:]...)
return result
}
// NormalizeMappings coalesces consecutive entries that describe one
// contiguous run of the same build. Two neighbouring entries are merged only
// when all of the following hold:
//
//   - they carry the same BuildID,
//   - the second starts exactly where the first ends in the address space,
//   - the second continues exactly where the first ends in the diff file.
//
// The storage check is skipped for the zero BuildID (uuid.Nil), because
// zero-fill entries are served without reading any file. Entries that share
// a BuildID but are separated by a gap, or that point at non-contiguous
// storage, are kept apart; merging them would make the combined entry cover
// the wrong address range or read the wrong bytes.
//
// The input must be sorted by Offset. It is not modified: the result is made
// of fresh copies. An empty input yields nil.
func NormalizeMappings(mappings []*BuildMap) []*BuildMap {
	if len(mappings) == 0 {
		return nil
	}

	// The zero value's BuildID is the all-zero UUID that marks zero-fill.
	var zeroFill BuildMap

	result := make([]*BuildMap, 0, len(mappings))
	current := *mappings[0]
	for _, m := range mappings[1:] {
		sameBuild := m.BuildID == current.BuildID
		adjacent := current.Offset+current.Length == m.Offset
		storageContiguous := m.BuildID == zeroFill.BuildID ||
			current.BuildStorageOffset+current.Length == m.BuildStorageOffset

		if sameBuild && adjacent && storageContiguous {
			current.Length += m.Length
			continue
		}

		finished := current
		result = append(result, &finished)
		current = *m
	}
	result = append(result, &current)
	return result
}

View File

@ -0,0 +1,189 @@
package snapshot
import (
"fmt"
"io"
"os"
"github.com/google/uuid"
)
const (
// DefaultBlockSize is the block granularity, in bytes, used when scanning
// memory files and building mappings: 4096 (4 KiB) — standard page size
// for Firecracker.
DefaultBlockSize int64 = 4096
)
// ProcessMemfile reads a full memory file produced by Firecracker's
// PUT /snapshot/create, identifies non-zero blocks, and writes only those
// blocks to a compact diff file. Returns the Header describing the mapping.
//
// The output diff file contains non-zero blocks written sequentially.
// The header maps each block in the full address space to either:
//   - A position in the diff file (for non-zero blocks)
//   - uuid.Nil (for zero/empty blocks, served as zeros without I/O)
//
// buildID identifies this snapshot generation in the header chain.
//
// The diff file at diffPath is created (truncating any existing file) and is
// closed before the header is written to headerPath; a failure to close it is
// reported as an error. Files already written are not removed on error.
func ProcessMemfile(memfilePath, diffPath, headerPath string, buildID uuid.UUID) (*Header, error) {
	memSize, dirty, empty, err := writeNonZeroBlocks(memfilePath, diffPath, nil)
	if err != nil {
		return nil, err
	}

	// Build header: dirty and empty runs together cover every block.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
	normalized := NormalizeMappings(MergeMappings(dirtyMappings, emptyMappings))

	metadata := NewMetadata(buildID, uint64(DefaultBlockSize), uint64(memSize))
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}

	// Write header to disk.
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}

// ProcessMemfileWithParent processes a memory file as a new generation on top
// of an existing parent header. This is used for re-pause of a sandbox that
// was restored from a snapshot.
//
// The memory file is scanned exactly like in ProcessMemfile: every non-zero
// block is written to the new diff file and every zero block is mapped to
// uuid.Nil. The resulting mapping is overlaid on parentHeader.Mapping and the
// metadata is derived with NextGeneration, so Size, BlockSize and BaseBuildID
// are inherited from the parent.
//
// NOTE(review): blocks are not compared with the parent's contents, and the
// new mapping covers every block of the memory file, so it replaces the
// parent mapping over that whole range. If memfilePath is a sparse diff
// snapshot rather than a full one, unchanged pages would end up zero-filled —
// confirm which kind the caller passes.
//
// An error is returned, before any file is created, when parentHeader (or its
// Metadata) is nil, when the parent's block size differs from
// DefaultBlockSize, or when the memory file size differs from the parent's
// Size; any of these would otherwise yield a header whose metadata disagrees
// with its mapping.
func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHeader *Header, buildID uuid.UUID) (*Header, error) {
	if parentHeader == nil || parentHeader.Metadata == nil {
		return nil, fmt.Errorf("parent header is required")
	}
	parent := parentHeader.Metadata
	if parent.BlockSize != uint64(DefaultBlockSize) {
		return nil, fmt.Errorf("parent block size %d does not match block size %d", parent.BlockSize, DefaultBlockSize)
	}
	sameSizeAsParent := func(memSize int64) error {
		if uint64(memSize) != parent.Size {
			return fmt.Errorf("memfile size %d does not match parent snapshot size %d", memSize, parent.Size)
		}
		return nil
	}

	_, dirty, empty, err := writeNonZeroBlocks(memfilePath, diffPath, sameSizeAsParent)
	if err != nil {
		return nil, err
	}

	// Build new generation header merged with parent.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
	diffMapping := MergeMappings(dirtyMappings, emptyMappings)
	normalized := NormalizeMappings(MergeMappings(parentHeader.Mapping, diffMapping))

	metadata := parent.NextGeneration(buildID)
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}

	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}

// writeNonZeroBlocks scans memfilePath in DefaultBlockSize blocks and appends
// every block that is not entirely zero to a freshly created diffPath.
//
// It returns the memory file size plus two per-block flag slices: dirty[i] is
// set when block i was written to the diff file, empty[i] when it was all
// zeros; exactly one of the two is set for every block. A short final block
// is zero-padded before it is classified and written.
//
// checkSize, when non-nil, is called with the memory file size before the
// diff file is created; a non-nil result aborts the scan and is returned
// unchanged. The diff file is closed before returning and a close failure is
// reported, since it can mean the written blocks did not reach the disk.
func writeNonZeroBlocks(memfilePath, diffPath string, checkSize func(memSize int64) error) (memSize int64, dirty, empty []bool, err error) {
	src, err := os.Open(memfilePath)
	if err != nil {
		return 0, nil, nil, fmt.Errorf("open memfile: %w", err)
	}
	// Read-only handle: a close error cannot lose data.
	defer src.Close()

	info, err := src.Stat()
	if err != nil {
		return 0, nil, nil, fmt.Errorf("stat memfile: %w", err)
	}
	memSize = info.Size()
	if checkSize != nil {
		if verr := checkSize(memSize); verr != nil {
			return 0, nil, nil, verr
		}
	}

	dst, err := os.Create(diffPath)
	if err != nil {
		return 0, nil, nil, fmt.Errorf("create diff file: %w", err)
	}
	defer func() {
		// Surface a close failure unless an earlier error is already being returned.
		if cerr := dst.Close(); cerr != nil && err == nil {
			memSize, dirty, empty = 0, nil, nil
			err = fmt.Errorf("close diff file: %w", cerr)
		}
	}()

	totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
	dirty = make([]bool, totalBlocks)
	empty = make([]bool, totalBlocks)
	buf := make([]byte, DefaultBlockSize)
	for i := int64(0); i < totalBlocks; i++ {
		n, rerr := io.ReadFull(src, buf)
		if rerr != nil && rerr != io.ErrUnexpectedEOF {
			return 0, nil, nil, fmt.Errorf("read block %d: %w", i, rerr)
		}
		// Zero-pad the last block if it's short.
		for j := n; j < len(buf); j++ {
			buf[j] = 0
		}
		if isZeroBlock(buf) {
			empty[i] = true
			continue
		}
		dirty[i] = true
		if _, werr := dst.Write(buf); werr != nil {
			return 0, nil, nil, fmt.Errorf("write diff block %d: %w", i, werr)
		}
	}
	return memSize, dirty, empty, nil
}
// isZeroBlock reports whether every byte of block is zero.
// An empty or nil block counts as all zeros.
func isZeroBlock(block []byte) bool {
	for _, b := range block {
		if b != 0 {
			return false
		}
	}
	return true
}