Add sandbox snapshot and restore with UFFD lazy memory loading

Implement full snapshot lifecycle: pause (snapshot + free resources),
resume (UFFD-based lazy restore), and named snapshot templates that
can spawn new sandboxes from frozen VM state.

Key changes:
- Snapshot header system with generational diff mapping (inspired by e2b)
- UFFD server for lazy page fault handling during snapshot restore
- Stable rootfs symlink path (/tmp/fc-vm/) for snapshot compatibility
- Templates DB table and CRUD API endpoints (POST/GET/DELETE /v1/snapshots)
- CreateSnapshot/DeleteSnapshot RPCs in hostagent proto
- Reconciler excludes paused sandboxes (expected absent from host agent)
- Snapshot templates lock vcpus/memory to baked-in values
- Proper cleanup of uffd sockets and pause snapshot files on destroy
This commit is contained in:
2026-03-12 09:19:37 +06:00
parent 9b94df7f56
commit a1bd439c75
33 changed files with 2714 additions and 166 deletions

View File

@ -44,7 +44,7 @@ type wsStopMsg struct {
// wsOutMsg is sent by the server for process events.
type wsOutMsg struct {
Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error"
Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error"
PID uint32 `json:"pid,omitempty"` // only for "start"
Data string `json:"data,omitempty"` // only for "stdout", "stderr", "error"
ExitCode *int32 `json:"exit_code,omitempty"` // only for "exit"

View File

@ -97,6 +97,17 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
}
ctx := r.Context()
// If the template is a snapshot, use its baked-in vcpus/memory
// (they cannot be changed since the VM state is frozen).
if tmpl, err := h.db.GetTemplate(ctx, req.Template); err == nil && tmpl.Type == "snapshot" {
if tmpl.Vcpus.Valid {
req.VCPUs = tmpl.Vcpus.Int32
}
if tmpl.MemoryMb.Valid {
req.MemoryMB = tmpl.MemoryMb.Int32
}
}
sandboxID := id.NewSandboxID()
// Insert pending record.
@ -182,6 +193,8 @@ func (h *sandboxHandler) Get(w http.ResponseWriter, r *http.Request) {
}
// Pause handles POST /v1/sandboxes/{id}/pause.
// Pause = snapshot + destroy. The sandbox is frozen to disk and all running
// resources are released. It can be resumed later.
func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
@ -216,6 +229,7 @@ func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
}
// Resume handles POST /v1/sandboxes/{id}/resume.
// Resume restores a paused sandbox from snapshot using UFFD lazy memory loading.
func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
@ -230,16 +244,24 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
return
}
if _, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxID,
})); err != nil {
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
}
sb, err = h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "running",
now := time.Now()
sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
})
if err != nil {
writeError(w, http.StatusInternalServerError, "db_error", "failed to update status")

View File

@ -0,0 +1,187 @@
package api
import (
"encoding/json"
"log/slog"
"net/http"
"time"
"connectrpc.com/connect"
"github.com/go-chi/chi/v5"
"github.com/jackc/pgx/v5/pgtype"
"git.omukk.dev/wrenn/sandbox/internal/db"
"git.omukk.dev/wrenn/sandbox/internal/id"
pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
"git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect"
)
// snapshotHandler serves the /v1/snapshots endpoints. It persists template
// metadata through the templates table and delegates the actual snapshot
// file operations to the host agent over its Connect RPC client.
type snapshotHandler struct {
	db    *db.Queries
	agent hostagentv1connect.HostAgentServiceClient
}
// newSnapshotHandler builds a snapshotHandler wired to the given DB queries
// and host agent client.
func newSnapshotHandler(db *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *snapshotHandler {
	h := &snapshotHandler{
		db:    db,
		agent: agent,
	}
	return h
}
// createSnapshotRequest is the JSON body of POST /v1/snapshots.
type createSnapshotRequest struct {
	SandboxID string `json:"sandbox_id"` // required: sandbox to snapshot
	Name      string `json:"name"`       // optional: auto-generated when empty
}
// snapshotResponse is the API representation of a template row. Pointer
// fields are omitted from the JSON when the underlying DB column is NULL.
type snapshotResponse struct {
	Name      string `json:"name"`
	Type      string `json:"type"` // "base" or "snapshot"
	VCPUs     *int32 `json:"vcpus,omitempty"`
	MemoryMB  *int32 `json:"memory_mb,omitempty"`
	SizeBytes int64  `json:"size_bytes"`
	CreatedAt string `json:"created_at"` // RFC 3339; empty if the column is NULL
}
// templateToResponse maps a templates table row onto the API response shape,
// translating nullable pg columns into optional (pointer / empty) fields.
func templateToResponse(t db.Template) snapshotResponse {
	out := snapshotResponse{
		Name:      t.Name,
		Type:      t.Type,
		SizeBytes: t.SizeBytes,
	}
	if t.Vcpus.Valid {
		v := t.Vcpus.Int32
		out.VCPUs = &v
	}
	if t.MemoryMb.Valid {
		mb := t.MemoryMb.Int32
		out.MemoryMB = &mb
	}
	if t.CreatedAt.Valid {
		out.CreatedAt = t.CreatedAt.Time.Format(time.RFC3339)
	}
	return out
}
// Create handles POST /v1/snapshots.
//
// Flow: validate the body, optionally clear an existing template of the same
// name (?overwrite=true), ask the host agent to snapshot the sandbox (the
// agent pauses it first if it is running), mark the sandbox paused in the DB,
// and finally record the new template row. Responds 201 with the template.
//
// Fix over the previous version: the DeleteTemplate error on the overwrite
// path was silently discarded; a surviving stale row would then make the
// later InsertTemplate fail after the snapshot had already been taken. The
// error is now checked and reported.
func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
	var req createSnapshotRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		writeError(w, http.StatusBadRequest, "invalid_request", "invalid JSON body")
		return
	}
	if req.SandboxID == "" {
		writeError(w, http.StatusBadRequest, "invalid_request", "sandbox_id is required")
		return
	}
	if req.Name == "" {
		req.Name = id.NewSnapshotName()
	}
	ctx := r.Context()
	overwrite := r.URL.Query().Get("overwrite") == "true"
	// Check if name already exists.
	if _, err := h.db.GetTemplate(ctx, req.Name); err == nil {
		if !overwrite {
			writeError(w, http.StatusConflict, "already_exists", "snapshot name already exists; use ?overwrite=true to replace")
			return
		}
		// Delete the existing template record so the insert below succeeds.
		// The on-disk files are presumably replaced when the agent copies the
		// new snapshot into place — TODO(review): confirm an agent
		// DeleteSnapshot call is not also needed here.
		if err := h.db.DeleteTemplate(ctx, req.Name); err != nil {
			slog.Error("failed to delete template for overwrite", "name", req.Name, "error", err)
			writeError(w, http.StatusInternalServerError, "db_error", "failed to replace existing template")
			return
		}
	}
	// Verify sandbox exists and is running or paused.
	sb, err := h.db.GetSandbox(ctx, req.SandboxID)
	if err != nil {
		writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
		return
	}
	if sb.Status != "running" && sb.Status != "paused" {
		writeError(w, http.StatusConflict, "invalid_state", "sandbox must be running or paused")
		return
	}
	// Call host agent to create snapshot. If running, the agent pauses it first.
	// The sandbox remains paused after this call.
	resp, err := h.agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
		SandboxId: req.SandboxID,
		Name:      req.Name,
	}))
	if err != nil {
		status, code, msg := agentErrToHTTP(err)
		writeError(w, status, code, msg)
		return
	}
	// Mark sandbox as paused (if it was running, it got paused by the snapshot).
	// A failure here is logged but non-fatal: the snapshot itself succeeded.
	if sb.Status != "paused" {
		if _, err := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
			ID: req.SandboxID, Status: "paused",
		}); err != nil {
			slog.Error("failed to update sandbox status after snapshot", "sandbox_id", req.SandboxID, "error", err)
		}
	}
	// Insert template record; vcpus/memory are locked to the source sandbox's
	// values because the snapshotted VM state is frozen.
	tmpl, err := h.db.InsertTemplate(ctx, db.InsertTemplateParams{
		Name:      req.Name,
		Type:      "snapshot",
		Vcpus:     pgtype.Int4{Int32: sb.Vcpus, Valid: true},
		MemoryMb:  pgtype.Int4{Int32: sb.MemoryMb, Valid: true},
		SizeBytes: resp.Msg.SizeBytes,
	})
	if err != nil {
		slog.Error("failed to insert template record", "name", req.Name, "error", err)
		writeError(w, http.StatusInternalServerError, "db_error", "snapshot created but failed to record in database")
		return
	}
	writeJSON(w, http.StatusCreated, templateToResponse(tmpl))
}
// List handles GET /v1/snapshots.
//
// An optional ?type= query parameter restricts the result to one template
// type; otherwise every template row is returned, as a JSON array.
func (h *snapshotHandler) List(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	var (
		rows []db.Template
		err  error
	)
	switch filter := r.URL.Query().Get("type"); filter {
	case "":
		rows, err = h.db.ListTemplates(ctx)
	default:
		rows, err = h.db.ListTemplatesByType(ctx, filter)
	}
	if err != nil {
		writeError(w, http.StatusInternalServerError, "db_error", "failed to list templates")
		return
	}

	out := make([]snapshotResponse, len(rows))
	for i := range rows {
		out[i] = templateToResponse(rows[i])
	}
	writeJSON(w, http.StatusOK, out)
}
// Delete handles DELETE /v1/snapshots/{name}.
//
// The agent-side file removal is best-effort (a failure is only logged);
// the database record is removed regardless, and 204 is returned on success.
func (h *snapshotHandler) Delete(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	name := chi.URLParam(r, "name")

	if _, err := h.db.GetTemplate(ctx, name); err != nil {
		writeError(w, http.StatusNotFound, "not_found", "template not found")
		return
	}

	// Delete files on host agent.
	delReq := connect.NewRequest(&pb.DeleteSnapshotRequest{Name: name})
	if _, err := h.agent.DeleteSnapshot(ctx, delReq); err != nil {
		slog.Warn("delete snapshot: agent RPC failed", "name", name, "error", err)
	}

	if err := h.db.DeleteTemplate(ctx, name); err != nil {
		writeError(w, http.StatusInternalServerError, "db_error", "failed to delete template record")
		return
	}
	w.WriteHeader(http.StatusNoContent)
}

View File

@ -126,9 +126,13 @@ paths:
post:
summary: Pause a running sandbox
operationId: pauseSandbox
description: |
Takes a snapshot of the sandbox (VM state + memory + rootfs), then
destroys all running resources. The sandbox exists only as files on
disk and can be resumed later.
responses:
"200":
description: Sandbox paused
description: Sandbox paused (snapshot taken, resources released)
content:
application/json:
schema:
@ -151,9 +155,13 @@ paths:
post:
summary: Resume a paused sandbox
operationId: resumeSandbox
description: |
Restores a paused sandbox from its snapshot using UFFD for lazy
memory loading. Boots a fresh Firecracker process, sets up a new
network slot, and waits for envd to become ready.
responses:
"200":
description: Sandbox resumed
description: Sandbox resumed (new VM booted from snapshot)
content:
application/json:
schema:
@ -165,6 +173,85 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/snapshots:
post:
summary: Create a snapshot template
operationId: createSnapshot
description: |
Pauses a running sandbox if necessary, takes a full snapshot, and copies
the snapshot files to the images directory as a reusable template. The
sandbox remains paused afterwards and can still be resumed. The template
can be used to create new sandboxes.
parameters:
- name: overwrite
in: query
required: false
schema:
type: string
enum: ["true"]
description: Set to "true" to overwrite an existing snapshot with the same name.
requestBody:
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/CreateSnapshotRequest"
responses:
"201":
description: Snapshot created
content:
application/json:
schema:
$ref: "#/components/schemas/Template"
"409":
description: Name already exists, or the sandbox is neither running nor paused
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
get:
summary: List templates
operationId: listSnapshots
parameters:
- name: type
in: query
required: false
schema:
type: string
enum: [base, snapshot]
description: Filter by template type.
responses:
"200":
description: List of templates
content:
application/json:
schema:
type: array
items:
$ref: "#/components/schemas/Template"
/v1/snapshots/{name}:
parameters:
- name: name
in: path
required: true
schema:
type: string
delete:
summary: Delete a snapshot template
operationId: deleteSnapshot
description: Removes the snapshot files from disk and deletes the database record.
responses:
"204":
description: Snapshot deleted
"404":
description: Template not found
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/sandboxes/{id}/files/write:
parameters:
- name: id
@ -429,6 +516,38 @@ components:
type: string
format: date-time
CreateSnapshotRequest:
type: object
required: [sandbox_id]
properties:
sandbox_id:
type: string
description: ID of the running sandbox to snapshot.
name:
type: string
description: Name for the snapshot template. Auto-generated if omitted.
Template:
type: object
properties:
name:
type: string
type:
type: string
enum: [base, snapshot]
vcpus:
type: integer
nullable: true
memory_mb:
type: integer
nullable: true
size_bytes:
type: integer
format: int64
created_at:
type: string
format: date-time
ExecRequest:
type: object
required: [cmd]

View File

@ -62,10 +62,12 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
alive[sb.SandboxId] = struct{}{}
}
// Get all DB sandboxes for this host that are running or paused.
// Get all DB sandboxes for this host that are running.
// Paused sandboxes are excluded: they are expected to not exist on the
// host agent because pause = snapshot + destroy resources.
dbSandboxes, err := rc.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
HostID: rc.hostID,
Column2: []string{"running", "paused"},
Column2: []string{"running"},
})
if err != nil {
slog.Warn("reconciler: failed to list DB sandboxes", "error", err)

View File

@ -29,6 +29,7 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
execStream := newExecStreamHandler(queries, agent)
files := newFilesHandler(queries, agent)
filesStream := newFilesStreamHandler(queries, agent)
snapshots := newSnapshotHandler(queries, agent)
// OpenAPI spec and docs.
r.Get("/openapi.yaml", serveOpenAPI)
@ -53,6 +54,13 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
})
})
// Snapshot / template management.
r.Route("/v1/snapshots", func(r chi.Router) {
r.Post("/", snapshots.Create)
r.Get("/", snapshots.List)
r.Delete("/{name}", snapshots.Delete)
})
return &Server{router: r}
}

View File

@ -24,3 +24,12 @@ type Sandbox struct {
LastActiveAt pgtype.Timestamptz `json:"last_active_at"`
LastUpdated pgtype.Timestamptz `json:"last_updated"`
}
// Template is a row in the templates table. A template is either a base
// rootfs image (type "base") or a frozen VM snapshot (type "snapshot")
// from which new sandboxes can be created.
type Template struct {
	Name      string             `json:"name"`
	Type      string             `json:"type"`      // "base" or "snapshot"
	Vcpus     pgtype.Int4        `json:"vcpus"`     // nullable; set for snapshot templates
	MemoryMb  pgtype.Int4        `json:"memory_mb"` // nullable; set for snapshot templates
	SizeBytes int64              `json:"size_bytes"`
	CreatedAt pgtype.Timestamptz `json:"created_at"`
}

View File

@ -0,0 +1,135 @@
// Code generated by sqlc. DO NOT EDIT.
// versions:
// sqlc v1.30.0
// source: templates.sql
package db
import (
"context"
"github.com/jackc/pgx/v5/pgtype"
)
const deleteTemplate = `-- name: DeleteTemplate :exec
DELETE FROM templates WHERE name = $1
`

// DeleteTemplate removes the template row with the given name.
// Deleting a name that does not exist is not an error (Exec ignores row count).
func (q *Queries) DeleteTemplate(ctx context.Context, name string) error {
	_, err := q.db.Exec(ctx, deleteTemplate, name)
	return err
}
const getTemplate = `-- name: GetTemplate :one
SELECT name, type, vcpus, memory_mb, size_bytes, created_at FROM templates WHERE name = $1
`

// GetTemplate fetches a single template row by name.
func (q *Queries) GetTemplate(ctx context.Context, name string) (Template, error) {
	row := q.db.QueryRow(ctx, getTemplate, name)
	var i Template
	err := row.Scan(
		&i.Name,
		&i.Type,
		&i.Vcpus,
		&i.MemoryMb,
		&i.SizeBytes,
		&i.CreatedAt,
	)
	return i, err
}
const insertTemplate = `-- name: InsertTemplate :one
INSERT INTO templates (name, type, vcpus, memory_mb, size_bytes)
VALUES ($1, $2, $3, $4, $5)
RETURNING name, type, vcpus, memory_mb, size_bytes, created_at
`

// InsertTemplateParams carries the column values for InsertTemplate.
// CreatedAt is not included: it is assigned by the database.
type InsertTemplateParams struct {
	Name      string      `json:"name"`
	Type      string      `json:"type"`
	Vcpus     pgtype.Int4 `json:"vcpus"`
	MemoryMb  pgtype.Int4 `json:"memory_mb"`
	SizeBytes int64       `json:"size_bytes"`
}

// InsertTemplate inserts a new template row and returns it (including the
// database-assigned created_at).
func (q *Queries) InsertTemplate(ctx context.Context, arg InsertTemplateParams) (Template, error) {
	row := q.db.QueryRow(ctx, insertTemplate,
		arg.Name,
		arg.Type,
		arg.Vcpus,
		arg.MemoryMb,
		arg.SizeBytes,
	)
	var i Template
	err := row.Scan(
		&i.Name,
		&i.Type,
		&i.Vcpus,
		&i.MemoryMb,
		&i.SizeBytes,
		&i.CreatedAt,
	)
	return i, err
}
const listTemplates = `-- name: ListTemplates :many
SELECT name, type, vcpus, memory_mb, size_bytes, created_at FROM templates ORDER BY created_at DESC
`

// ListTemplates returns all template rows, newest first.
func (q *Queries) ListTemplates(ctx context.Context) ([]Template, error) {
	rows, err := q.db.Query(ctx, listTemplates)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var items []Template
	for rows.Next() {
		var i Template
		if err := rows.Scan(
			&i.Name,
			&i.Type,
			&i.Vcpus,
			&i.MemoryMb,
			&i.SizeBytes,
			&i.CreatedAt,
		); err != nil {
			return nil, err
		}
		items = append(items, i)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return items, nil
}
const listTemplatesByType = `-- name: ListTemplatesByType :many
SELECT name, type, vcpus, memory_mb, size_bytes, created_at FROM templates WHERE type = $1 ORDER BY created_at DESC
`

// ListTemplatesByType returns all template rows of the given type
// ("base" or "snapshot"), newest first.
func (q *Queries) ListTemplatesByType(ctx context.Context, type_ string) ([]Template, error) {
	rows, err := q.db.Query(ctx, listTemplatesByType, type_)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var items []Template
	for rows.Next() {
		var i Template
		if err := rows.Scan(
			&i.Name,
			&i.Type,
			&i.Vcpus,
			&i.MemoryMb,
			&i.SizeBytes,
			&i.CreatedAt,
		); err != nil {
			return nil, err
		}
		items = append(items, i)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return items, nil
}

View File

@ -71,10 +71,39 @@ func (s *Server) ResumeSandbox(
ctx context.Context,
req *connect.Request[pb.ResumeSandboxRequest],
) (*connect.Response[pb.ResumeSandboxResponse], error) {
if err := s.mgr.Resume(ctx, req.Msg.SandboxId); err != nil {
sb, err := s.mgr.Resume(ctx, req.Msg.SandboxId)
if err != nil {
return nil, connect.NewError(connect.CodeInternal, err)
}
return connect.NewResponse(&pb.ResumeSandboxResponse{}), nil
return connect.NewResponse(&pb.ResumeSandboxResponse{
SandboxId: sb.ID,
Status: string(sb.Status),
HostIp: sb.HostIP.String(),
}), nil
}
// CreateSnapshot snapshots the given sandbox into a reusable named template
// via the manager, returning the template name and its on-disk size.
func (s *Server) CreateSnapshot(
	ctx context.Context,
	req *connect.Request[pb.CreateSnapshotRequest],
) (*connect.Response[pb.CreateSnapshotResponse], error) {
	name := req.Msg.Name
	sizeBytes, err := s.mgr.CreateSnapshot(ctx, req.Msg.SandboxId, name)
	if err != nil {
		return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("create snapshot: %w", err))
	}
	resp := &pb.CreateSnapshotResponse{
		Name:      name,
		SizeBytes: sizeBytes,
	}
	return connect.NewResponse(resp), nil
}
// DeleteSnapshot removes a named snapshot template from disk via the manager.
func (s *Server) DeleteSnapshot(
	ctx context.Context,
	req *connect.Request[pb.DeleteSnapshotRequest],
) (*connect.Response[pb.DeleteSnapshotResponse], error) {
	err := s.mgr.DeleteSnapshot(req.Msg.Name)
	if err != nil {
		return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("delete snapshot: %w", err))
	}
	return connect.NewResponse(&pb.DeleteSnapshotResponse{}), nil
}
func (s *Server) Exec(
@ -352,15 +381,15 @@ func (s *Server) ListSandboxes(
infos := make([]*pb.SandboxInfo, len(sandboxes))
for i, sb := range sandboxes {
infos[i] = &pb.SandboxInfo{
SandboxId: sb.ID,
Status: string(sb.Status),
Template: sb.Template,
Vcpus: int32(sb.VCPUs),
MemoryMb: int32(sb.MemoryMB),
HostIp: sb.HostIP.String(),
CreatedAtUnix: sb.CreatedAt.Unix(),
SandboxId: sb.ID,
Status: string(sb.Status),
Template: sb.Template,
Vcpus: int32(sb.VCPUs),
MemoryMb: int32(sb.MemoryMB),
HostIp: sb.HostIP.String(),
CreatedAtUnix: sb.CreatedAt.Unix(),
LastActiveAtUnix: sb.LastActiveAt.Unix(),
TimeoutSec: int32(sb.TimeoutSec),
TimeoutSec: int32(sb.TimeoutSec),
}
}

View File

@ -14,3 +14,12 @@ func NewSandboxID() string {
}
return "sb-" + hex.EncodeToString(b)
}
// NewSnapshotName generates a snapshot name in the format "template-" + 8 hex chars.
func NewSnapshotName() string {
b := make([]byte, 4)
if _, err := rand.Read(b); err != nil {
panic(fmt.Sprintf("crypto/rand failed: %v", err))
}
return "template-" + hex.EncodeToString(b)
}

View File

@ -9,19 +9,24 @@ import (
"sync"
"time"
"github.com/google/uuid"
"git.omukk.dev/wrenn/sandbox/internal/envdclient"
"git.omukk.dev/wrenn/sandbox/internal/filesystem"
"git.omukk.dev/wrenn/sandbox/internal/id"
"git.omukk.dev/wrenn/sandbox/internal/models"
"git.omukk.dev/wrenn/sandbox/internal/network"
"git.omukk.dev/wrenn/sandbox/internal/snapshot"
"git.omukk.dev/wrenn/sandbox/internal/uffd"
"git.omukk.dev/wrenn/sandbox/internal/vm"
)
// Config holds the paths and defaults for the sandbox manager.
type Config struct {
	KernelPath   string        // guest kernel image passed to every VM
	ImagesDir    string        // directory containing template images (e.g., /var/lib/wrenn/images/{name}/rootfs.ext4)
	SandboxesDir string        // directory for per-sandbox rootfs clones (e.g., /var/lib/wrenn/sandboxes)
	SnapshotsDir string        // directory for pause snapshots (e.g., /var/lib/wrenn/snapshots/{sandbox-id}/)
	EnvdTimeout  time.Duration // how long to wait for envd in the guest to become ready
}
@ -38,8 +43,9 @@ type Manager struct {
// sandboxState holds the runtime state for a single sandbox.
type sandboxState struct {
	models.Sandbox

	slot           *network.Slot      // network slot backing this sandbox (tap device, IPs)
	client         *envdclient.Client // client for the envd agent inside the guest
	uffdSocketPath string             // non-empty for sandboxes restored from snapshot
}
// New creates a new sandbox manager.
@ -74,8 +80,13 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
template = "minimal"
}
// Resolve base rootfs image: /var/lib/wrenn/images/{template}.ext4
baseRootfs := filepath.Join(m.cfg.ImagesDir, template+".ext4")
// Check if template refers to a snapshot (has snapfile + memfile + header + rootfs).
if snapshot.IsSnapshot(m.cfg.ImagesDir, template) {
return m.createFromSnapshot(ctx, sandboxID, template, vcpus, memoryMB, timeoutSec)
}
// Resolve base rootfs image: /var/lib/wrenn/images/{template}/rootfs.ext4
baseRootfs := filepath.Join(m.cfg.ImagesDir, template, "rootfs.ext4")
if _, err := os.Stat(baseRootfs); err != nil {
return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
}
@ -168,18 +179,22 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
return &sb.Sandbox, nil
}
// Destroy stops and cleans up a sandbox.
// Destroy stops and cleans up a sandbox. If the sandbox is running, its VM,
// network, and rootfs are torn down. Any pause snapshot files are also removed.
func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
m.mu.Lock()
sb, ok := m.boxes[sandboxID]
if !ok {
m.mu.Unlock()
return fmt.Errorf("sandbox not found: %s", sandboxID)
if ok {
delete(m.boxes, sandboxID)
}
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanup(ctx, sb)
if ok {
m.cleanup(ctx, sb)
}
// Always clean up pause snapshot files (may exist if sandbox was paused).
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
slog.Info("sandbox destroyed", "id", sandboxID)
return nil
@ -195,9 +210,14 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
}
m.slots.Release(sb.SlotIndex)
os.Remove(sb.RootfsPath)
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
}
// Pause pauses a running sandbox.
// Pause takes a snapshot of a running sandbox, then destroys all resources.
// The sandbox's snapshot files are stored at SnapshotsDir/{sandboxID}/.
// After this call, the sandbox is no longer running but can be resumed.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
sb, err := m.get(sandboxID)
if err != nil {
@ -208,40 +228,386 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
}
// Step 1: Pause the VM (freeze vCPUs).
if err := m.vm.Pause(ctx, sandboxID); err != nil {
return fmt.Errorf("pause VM: %w", err)
}
// Step 2: Take a full snapshot (snapfile + memfile).
if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
return fmt.Errorf("create snapshot dir: %w", err)
}
snapDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
rawMemPath := filepath.Join(snapDir, "memfile.raw")
snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID)
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
return fmt.Errorf("create VM snapshot: %w", err)
}
// Step 3: Process the raw memfile into a compact diff + header.
buildID := uuid.New()
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID)
if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
return fmt.Errorf("process memfile: %w", err)
}
// Remove the raw memfile — we only keep the compact diff.
os.Remove(rawMemPath)
// Step 4: Copy rootfs into snapshot dir.
snapshotRootfs := snapshot.RootfsPath(m.cfg.SnapshotsDir, sandboxID)
if err := filesystem.CloneRootfs(sb.RootfsPath, snapshotRootfs); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
return fmt.Errorf("copy rootfs: %w", err)
}
// Step 5: Destroy the sandbox (free VM, network, rootfs clone).
m.mu.Lock()
sb.Status = models.StatusPaused
delete(m.boxes, sandboxID)
m.mu.Unlock()
slog.Info("sandbox paused", "id", sandboxID)
m.cleanup(ctx, sb)
slog.Info("sandbox paused (snapshot + destroy)", "id", sandboxID)
return nil
}
// Resume restores a paused sandbox from its snapshot using UFFD for
// lazy memory loading. The sandbox gets a new network slot.
//
// Acquisition order: memory source → rootfs clone → network slot →
// UFFD server → VM → envd readiness. Every failure path releases what was
// acquired so far, in reverse order.
func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox, error) {
	snapDir := m.cfg.SnapshotsDir
	if !snapshot.Exists(snapDir, sandboxID) {
		return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
	}
	// Read the header to set up the UFFD memory source.
	headerData, err := os.ReadFile(snapshot.MemHeaderPath(snapDir, sandboxID))
	if err != nil {
		return nil, fmt.Errorf("read header: %w", err)
	}
	header, err := snapshot.Deserialize(headerData)
	if err != nil {
		return nil, fmt.Errorf("deserialize header: %w", err)
	}
	// Build diff file map (build ID → file path).
	diffPaths := map[string]string{
		header.Metadata.BuildID.String(): snapshot.MemDiffPath(snapDir, sandboxID),
	}
	source, err := uffd.NewDiffFileSource(header, diffPaths)
	if err != nil {
		return nil, fmt.Errorf("create memory source: %w", err)
	}
	// Clone snapshot rootfs for this sandbox.
	snapshotRootfs := snapshot.RootfsPath(snapDir, sandboxID)
	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-resume.ext4", sandboxID))
	if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
		source.Close()
		return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
	}
	// Allocate network slot. The original slot was released on pause, so the
	// resumed sandbox may come back on a different slot (and host IP).
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		source.Close()
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)
	if err := network.CreateNetwork(slot); err != nil {
		source.Close()
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("create network: %w", err)
	}
	// Start UFFD server.
	uffdSocketPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-uffd.sock", sandboxID))
	os.Remove(uffdSocketPath) // Clean stale socket.
	uffdServer := uffd.NewServer(uffdSocketPath, source)
	if err := uffdServer.Start(ctx); err != nil {
		source.Close()
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("start uffd server: %w", err)
	}
	// Restore VM from snapshot.
	vmCfg := vm.VMConfig{
		SandboxID:  sandboxID,
		KernelPath: m.cfg.KernelPath,
		RootfsPath: rootfsPath,
		// NOTE(review): VCPUs is set to the memory size in MB as a
		// placeholder — the comment below claims the snapshot overrides it,
		// which matches restore-from-snapshot semantics, but confirm the VM
		// layer really ignores it.
		VCPUs:            int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot.
		MemoryMB:         int(header.Metadata.Size / (1024 * 1024)),
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
	}
	snapPath := snapshot.SnapPath(snapDir, sandboxID)
	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
		uffdServer.Stop()
		source.Close()
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
	}
	// Wait for envd to be ready.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer waitCancel()
	if err := client.WaitUntilReady(waitCtx); err != nil {
		uffdServer.Stop()
		source.Close()
		// Background context: tear the VM down even if the request ctx is done.
		m.vm.Destroy(context.Background(), sandboxID)
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}
	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:       sandboxID,
			Status:   models.StatusRunning,
			Template: "", // original template name is not stored in the snapshot
			VCPUs:    vmCfg.VCPUs,
			MemoryMB: vmCfg.MemoryMB,
			// NOTE(review): TimeoutSec is reset to 0 on resume — the original
			// timeout is not persisted with the snapshot; confirm intended.
			TimeoutSec:   0,
			SlotIndex:    slotIdx,
			HostIP:       slot.HostIP,
			RootfsPath:   rootfsPath,
			CreatedAt:    now,
			LastActiveAt: now,
		},
		slot:           slot,
		client:         client,
		uffdSocketPath: uffdSocketPath,
	}
	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()
	// Clean up the snapshot files now that the sandbox is running.
	// NOTE(review): the UFFD source still lazily reads the mem diff file;
	// this relies on unlink-with-open-fd semantics (the fd opened by
	// NewDiffFileSource keeps the data alive) — confirm the source holds
	// the file open for the sandbox's lifetime.
	snapshot.Remove(snapDir, sandboxID)
	slog.Info("sandbox resumed from snapshot",
		"id", sandboxID,
		"host_ip", slot.HostIP.String(),
	)
	return &sb.Sandbox, nil
}
// CreateSnapshot creates a reusable template from a sandbox. Works on both
// running and paused sandboxes. If the sandbox is running, it is paused first.
// The sandbox remains paused after this call (it can still be resumed).
// The template files are copied to ImagesDir/{name}/.
// Returns the total on-disk size of the template in bytes.
func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (int64, error) {
	// If the sandbox is running, pause it first. A get() error means the
	// sandbox is not in the in-memory map, i.e. already paused/destroyed.
	if _, err := m.get(sandboxID); err == nil {
		if err := m.Pause(ctx, sandboxID); err != nil {
			return 0, fmt.Errorf("pause sandbox: %w", err)
		}
	}
	// At this point, pause snapshot files must exist in SnapshotsDir/{sandboxID}/.
	if !snapshot.Exists(m.cfg.SnapshotsDir, sandboxID) {
		return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
	}
	// Copy snapshot files to ImagesDir/{name}/ as a reusable template.
	if err := snapshot.EnsureDir(m.cfg.ImagesDir, name); err != nil {
		return 0, fmt.Errorf("create template dir: %w", err)
	}
	srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
	dstDir := snapshot.DirPath(m.cfg.ImagesDir, name)
	for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName, snapshot.RootfsFileName} {
		src := filepath.Join(srcDir, fname)
		dst := filepath.Join(dstDir, fname)
		// CloneRootfs is used here as a general copy helper for all snapshot
		// files, not only rootfs images — presumably a reflink/copy; verify.
		if err := filesystem.CloneRootfs(src, dst); err != nil {
			// Roll back the partially-copied template directory.
			snapshot.Remove(m.cfg.ImagesDir, name)
			return 0, fmt.Errorf("copy %s: %w", fname, err)
		}
	}
	// Size is informational only; a failure here does not fail the snapshot
	// (sizeBytes stays 0 and a warning is logged).
	sizeBytes, err := snapshot.DirSize(m.cfg.ImagesDir, name)
	if err != nil {
		slog.Warn("failed to calculate snapshot size", "error", err)
	}
	slog.Info("snapshot created",
		"sandbox", sandboxID,
		"name", name,
		"size_bytes", sizeBytes,
	)
	return sizeBytes, nil
}
// DeleteSnapshot removes a snapshot template from disk.
func (m *Manager) DeleteSnapshot(name string) error {
	err := snapshot.Remove(m.cfg.ImagesDir, name)
	return err
}
// createFromSnapshot creates a new sandbox by restoring from a snapshot template
// in ImagesDir/{snapshotName}/. Uses UFFD for lazy memory loading.
//
// Acquisition order: memory source → rootfs clone → network slot →
// UFFD server → VM → envd readiness. Every failure path releases what was
// acquired so far, in reverse order.
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, memoryMB, timeoutSec int) (*models.Sandbox, error) {
	imagesDir := m.cfg.ImagesDir
	// Read the header.
	headerData, err := os.ReadFile(snapshot.MemHeaderPath(imagesDir, snapshotName))
	if err != nil {
		return nil, fmt.Errorf("read snapshot header: %w", err)
	}
	header, err := snapshot.Deserialize(headerData)
	if err != nil {
		return nil, fmt.Errorf("deserialize header: %w", err)
	}
	// Snapshot determines memory size. VCPUs are also baked into the
	// snapshot state — the caller should pass the correct value from
	// the template DB record. The caller-provided memoryMB is overwritten.
	memoryMB = int(header.Metadata.Size / (1024 * 1024))
	// Build diff file map (build ID → file path).
	diffPaths := map[string]string{
		header.Metadata.BuildID.String(): snapshot.MemDiffPath(imagesDir, snapshotName),
	}
	source, err := uffd.NewDiffFileSource(header, diffPaths)
	if err != nil {
		return nil, fmt.Errorf("create memory source: %w", err)
	}
	// Clone snapshot rootfs.
	snapshotRootfs := snapshot.RootfsPath(imagesDir, snapshotName)
	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, snapshotName))
	if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
		source.Close()
		return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
	}
	// Allocate network.
	slotIdx, err := m.slots.Allocate()
	if err != nil {
		source.Close()
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("allocate network slot: %w", err)
	}
	slot := network.NewSlot(slotIdx)
	if err := network.CreateNetwork(slot); err != nil {
		source.Close()
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("create network: %w", err)
	}
	// Start UFFD server.
	uffdSocketPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-uffd.sock", sandboxID))
	os.Remove(uffdSocketPath) // Clean stale socket.
	uffdServer := uffd.NewServer(uffdSocketPath, source)
	if err := uffdServer.Start(ctx); err != nil {
		source.Close()
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("start uffd server: %w", err)
	}
	// Restore VM.
	vmCfg := vm.VMConfig{
		SandboxID:        sandboxID,
		KernelPath:       m.cfg.KernelPath,
		RootfsPath:       rootfsPath,
		VCPUs:            vcpus,
		MemoryMB:         memoryMB,
		NetworkNamespace: slot.NamespaceID,
		TapDevice:        slot.TapName,
		TapMAC:           slot.TapMAC,
		GuestIP:          slot.GuestIP,
		GatewayIP:        slot.TapIP,
		NetMask:          slot.GuestNetMask,
	}
	snapPath := snapshot.SnapPath(imagesDir, snapshotName)
	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
		uffdServer.Stop()
		source.Close()
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
	}
	// Wait for envd.
	client := envdclient.New(slot.HostIP.String())
	waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer waitCancel()
	if err := client.WaitUntilReady(waitCtx); err != nil {
		uffdServer.Stop()
		source.Close()
		// Background context: tear the VM down even if the request ctx is done.
		m.vm.Destroy(context.Background(), sandboxID)
		network.RemoveNetwork(slot)
		m.slots.Release(slotIdx)
		os.Remove(rootfsPath)
		return nil, fmt.Errorf("wait for envd: %w", err)
	}
	now := time.Now()
	sb := &sandboxState{
		Sandbox: models.Sandbox{
			ID:           sandboxID,
			Status:       models.StatusRunning,
			Template:     snapshotName,
			VCPUs:        vcpus,
			MemoryMB:     memoryMB,
			TimeoutSec:   timeoutSec,
			SlotIndex:    slotIdx,
			HostIP:       slot.HostIP,
			RootfsPath:   rootfsPath,
			CreatedAt:    now,
			LastActiveAt: now,
		},
		slot:           slot,
		client:         client,
		uffdSocketPath: uffdSocketPath,
	}
	m.mu.Lock()
	m.boxes[sandboxID] = sb
	m.mu.Unlock()
	slog.Info("sandbox created from snapshot",
		"id", sandboxID,
		"snapshot", snapshotName,
		"host_ip", slot.HostIP.String(),
	)
	return &sb.Sandbox, nil
}
// Exec runs a command inside a sandbox.

220
internal/snapshot/header.go Normal file
View File

@ -0,0 +1,220 @@
// Package snapshot implements snapshot storage, header-based memory mapping,
// and memory file processing for Firecracker VM snapshots.
//
// The header system implements a generational copy-on-write memory mapping.
// Each snapshot generation stores only the blocks that changed since the
// previous generation. A Header contains a sorted list of BuildMap entries
// that together cover the entire memory address space, with each entry
// pointing to a specific generation's diff file.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
package snapshot
import (
"bytes"
"context"
"encoding/binary"
"errors"
"fmt"
"io"
"github.com/google/uuid"
)
const metadataVersion = 1
// Metadata is the fixed-size header prefix describing the snapshot memory layout.
// Binary layout (little-endian, 64 bytes total):
//
//	Version     uint64 (8 bytes)
//	BlockSize   uint64 (8 bytes)
//	Size        uint64 (8 bytes) — total memory size in bytes
//	Generation  uint64 (8 bytes)
//	BuildID     [16]byte (UUID)
//	BaseBuildID [16]byte (UUID)
type Metadata struct {
	Version    uint64 // format version; currently metadataVersion (1)
	BlockSize  uint64 // mapping granularity in bytes
	Size       uint64 // total guest memory size in bytes
	Generation uint64 // 0 for the first snapshot, incremented per re-pause
	// BuildID identifies this generation's diff file.
	BuildID uuid.UUID
	// BaseBuildID is the BuildID of generation 0, constant across the chain.
	BaseBuildID uuid.UUID
}
// NewMetadata creates metadata for a first-generation snapshot: generation 0,
// with BaseBuildID equal to the generation's own BuildID.
func NewMetadata(buildID uuid.UUID, blockSize, size uint64) *Metadata {
	md := &Metadata{
		Version:     metadataVersion,
		BlockSize:   blockSize,
		Size:        size,
		Generation:  0,
		BuildID:     buildID,
		BaseBuildID: buildID,
	}
	return md
}
// NextGeneration creates metadata for the next generation in the chain:
// same version, block size, memory size, and base build, with the
// generation counter incremented and a fresh build ID.
func (m *Metadata) NextGeneration(buildID uuid.UUID) *Metadata {
	next := *m
	next.Generation = m.Generation + 1
	next.BuildID = buildID
	return &next
}
// BuildMap maps a contiguous range of the memory address space to a specific
// generation's diff file. Binary layout (little-endian, 40 bytes):
//
//	Offset             uint64 — byte offset in the virtual address space
//	Length             uint64 — byte count (multiple of BlockSize)
//	BuildID            [16]byte — which generation's diff file, uuid.Nil = zero-fill
//	BuildStorageOffset uint64 — byte offset within that generation's diff file
type BuildMap struct {
	Offset uint64 // start of the range in the virtual address space
	Length uint64 // range length in bytes
	// BuildID selects the diff file; uuid.Nil means the range is zero-filled
	// and never touches disk.
	BuildID uuid.UUID
	// BuildStorageOffset is where this range's data begins inside the
	// selected diff file (meaningless when BuildID is uuid.Nil).
	BuildStorageOffset uint64
}
// Header is the in-memory representation of a snapshot's memory mapping.
// It provides O(log N) lookup from any memory offset to the correct
// generation's diff file and offset within it.
type Header struct {
	Metadata *Metadata
	// Mapping is sorted by Offset and covers the full address space.
	Mapping []*BuildMap
	// blockStarts tracks which block indices start a new BuildMap entry.
	// startMap provides direct access from block index to the BuildMap.
	blockStarts []bool
	startMap    map[int64]*BuildMap
}
// NewHeader creates a Header from metadata and mapping entries.
// If mapping is nil/empty, a single entry covering the full size is created,
// pointing everything at this generation's diff file from offset 0.
func NewHeader(metadata *Metadata, mapping []*BuildMap) (*Header, error) {
	if metadata.BlockSize == 0 {
		return nil, fmt.Errorf("block size cannot be zero")
	}
	if len(mapping) == 0 {
		mapping = []*BuildMap{{
			Offset:             0,
			Length:             metadata.Size,
			BuildID:            metadata.BuildID,
			BuildStorageOffset: 0,
		}}
	}
	total := TotalBlocks(int64(metadata.Size), int64(metadata.BlockSize))
	isStart := make([]bool, total)
	byBlock := make(map[int64]*BuildMap, len(mapping))
	// Record the starting block of every entry; out-of-range entries are
	// silently ignored, matching the serialized format's tolerance.
	for _, entry := range mapping {
		blk := BlockIdx(int64(entry.Offset), int64(metadata.BlockSize))
		if blk < 0 || blk >= total {
			continue
		}
		isStart[blk] = true
		byBlock[blk] = entry
	}
	return &Header{
		Metadata:    metadata,
		Mapping:     mapping,
		blockStarts: isStart,
		startMap:    byBlock,
	}, nil
}
// GetShiftedMapping resolves a memory offset to the corresponding diff file
// offset, remaining length, and build ID. This is the hot path called for
// every UFFD page fault.
//
// Mapping is sorted by Offset and covers the whole address space by
// construction (NewHeader/CreateMapping/MergeMappings), so a binary search
// finds the covering entry in O(log N). The previous implementation walked
// a per-block bitmap backwards, which degraded to O(total blocks) when the
// faulting offset was far from its mapping's start.
func (h *Header) GetShiftedMapping(_ context.Context, offset int64) (mappedOffset int64, mappedLength int64, buildID *uuid.UUID, err error) {
	if offset < 0 || offset >= int64(h.Metadata.Size) {
		return 0, 0, nil, fmt.Errorf("offset %d out of bounds (size: %d)", offset, h.Metadata.Size)
	}
	// Binary search for the last mapping entry whose Offset <= offset.
	lo, hi := 0, len(h.Mapping)
	for lo < hi {
		mid := int(uint(lo+hi) >> 1)
		if int64(h.Mapping[mid].Offset) <= offset {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	if lo == 0 {
		return 0, 0, nil, fmt.Errorf("no mapping found for offset %d", offset)
	}
	m := h.Mapping[lo-1]
	// Resolve at block granularity: the faulting offset may be unaligned,
	// but storage positions are tracked per block (same semantics as the
	// previous bitmap walk, which floored to the block index).
	blockSize := int64(h.Metadata.BlockSize)
	shift := BlockOffset(BlockIdx(offset, blockSize)-BlockIdx(int64(m.Offset), blockSize), blockSize)
	if shift >= int64(m.Length) {
		return 0, 0, nil, fmt.Errorf("offset %d beyond mapping end (mapping offset=%d, length=%d)", offset, m.Offset, m.Length)
	}
	return int64(m.BuildStorageOffset) + shift, int64(m.Length) - shift, &m.BuildID, nil
}
// Serialize writes metadata + mapping entries to binary (little-endian):
// the fixed 64-byte Metadata followed by one 40-byte BuildMap per entry,
// in order. Deserialize is the exact inverse.
func Serialize(metadata *Metadata, mappings []*BuildMap) ([]byte, error) {
	var out bytes.Buffer
	if err := binary.Write(&out, binary.LittleEndian, metadata); err != nil {
		return nil, fmt.Errorf("write metadata: %w", err)
	}
	for _, entry := range mappings {
		if err := binary.Write(&out, binary.LittleEndian, entry); err != nil {
			return nil, fmt.Errorf("write mapping: %w", err)
		}
	}
	return out.Bytes(), nil
}
// Deserialize reads a header from binary data: a fixed-size Metadata prefix
// followed by zero or more BuildMap entries until EOF.
func Deserialize(data []byte) (*Header, error) {
	r := bytes.NewReader(data)
	var meta Metadata
	if err := binary.Read(r, binary.LittleEndian, &meta); err != nil {
		return nil, fmt.Errorf("read metadata: %w", err)
	}
	var entries []*BuildMap
	for {
		entry := new(BuildMap)
		err := binary.Read(r, binary.LittleEndian, entry)
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("read mapping: %w", err)
		}
		entries = append(entries, entry)
	}
	return NewHeader(&meta, entries)
}
// Block index helpers shared by the header and memfile processing.

// TotalBlocks returns how many blockSize-sized blocks are needed to cover
// size bytes; a final partial block counts as a whole one.
func TotalBlocks(size, blockSize int64) int64 {
	return (size + blockSize - 1) / blockSize
}

// BlockIdx returns the index of the block containing byte offset.
func BlockIdx(offset, blockSize int64) int64 {
	return offset / blockSize
}

// BlockOffset returns the byte offset at which block idx begins.
func BlockOffset(idx, blockSize int64) int64 {
	return idx * blockSize
}

View File

@ -1 +1,101 @@
package snapshot
import (
"fmt"
"io/fs"
"os"
"path/filepath"
)
const (
SnapFileName = "snapfile"
MemDiffName = "memfile"
MemHeaderName = "memfile.header"
RootfsFileName = "rootfs.ext4"
)
// DirPath returns the snapshot directory for a given name.
func DirPath(baseDir, name string) string {
return filepath.Join(baseDir, name)
}
// SnapPath returns the path to the VM state snapshot file.
func SnapPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), SnapFileName)
}
// MemDiffPath returns the path to the compact memory diff file.
func MemDiffPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemDiffName)
}
// MemHeaderPath returns the path to the memory mapping header file.
func MemHeaderPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemHeaderName)
}
// RootfsPath returns the path to the rootfs image.
func RootfsPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsFileName)
}
// Exists reports whether a complete snapshot exists (all required files present).
func Exists(baseDir, name string) bool {
dir := DirPath(baseDir, name)
for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName, RootfsFileName} {
if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
return false
}
}
return true
}
// IsTemplate reports whether a template image directory exists (has rootfs.ext4).
func IsTemplate(baseDir, name string) bool {
_, err := os.Stat(filepath.Join(DirPath(baseDir, name), RootfsFileName))
return err == nil
}
// IsSnapshot reports whether a directory is a snapshot (has all snapshot files).
func IsSnapshot(baseDir, name string) bool {
return Exists(baseDir, name)
}
// EnsureDir creates the snapshot directory if it doesn't exist.
func EnsureDir(baseDir, name string) error {
dir := DirPath(baseDir, name)
if err := os.MkdirAll(dir, 0755); err != nil {
return fmt.Errorf("create snapshot dir %s: %w", dir, err)
}
return nil
}
// Remove deletes the entire snapshot directory.
func Remove(baseDir, name string) error {
return os.RemoveAll(DirPath(baseDir, name))
}
// DirSize returns the total byte size of all files in the snapshot directory.
func DirSize(baseDir, name string) (int64, error) {
var total int64
dir := DirPath(baseDir, name)
err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if d.IsDir() {
return nil
}
info, err := d.Info()
if err != nil {
return err
}
total += info.Size()
return nil
})
if err != nil {
return 0, fmt.Errorf("calculate snapshot size: %w", err)
}
return total, nil
}

View File

@ -0,0 +1,213 @@
package snapshot
import "github.com/google/uuid"
// CreateMapping converts a dirty-block bitset (represented as a []bool) into
// a sorted list of BuildMap entries. Consecutive dirty blocks are merged into
// a single entry. BuildStorageOffset tracks the sequential position in the
// compact diff file.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func CreateMapping(buildID uuid.UUID, dirty []bool, blockSize int64) []*BuildMap {
	var (
		result  []*BuildMap
		start   = int64(-1) // first block of the current run, -1 = none
		storage uint64      // next sequential position in the diff file
	)
	// flush emits the run [start, end) as a single entry and advances the
	// diff-file storage cursor by the run's byte length.
	flush := func(end int64) {
		if start < 0 {
			return
		}
		length := uint64(end-start) * uint64(blockSize)
		result = append(result, &BuildMap{
			Offset:             uint64(start) * uint64(blockSize),
			Length:             length,
			BuildID:            buildID,
			BuildStorageOffset: storage,
		})
		storage += length
		start = -1
	}
	for i, set := range dirty {
		if set {
			if start < 0 {
				start = int64(i)
			}
			continue
		}
		flush(int64(i))
	}
	flush(int64(len(dirty)))
	return result
}
// MergeMappings overlays diffMapping on top of baseMapping. Where they overlap,
// diff takes priority. The result covers the entire address space.
//
// Both inputs must be sorted by Offset. The base mapping should cover the full size.
//
// The merge is a two-pointer sweep: at each step exactly one of six mutually
// exclusive interval relations between the current base and diff entries
// applies, an output entry may be emitted, and at least one pointer (or the
// current base's remaining extent) advances — guaranteeing termination.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func MergeMappings(baseMapping, diffMapping []*BuildMap) []*BuildMap {
	if len(diffMapping) == 0 {
		return baseMapping
	}
	// Work on a copy of baseMapping to avoid mutating the original.
	// Entries in the copy are trimmed in place as diff entries consume them.
	baseCopy := make([]*BuildMap, len(baseMapping))
	for i, m := range baseMapping {
		cp := *m
		baseCopy[i] = &cp
	}
	var result []*BuildMap
	var bi, di int
	for bi < len(baseCopy) && di < len(diffMapping) {
		base := baseCopy[bi]
		diff := diffMapping[di]
		// Zero-length entries carry no range; drop them.
		if base.Length == 0 {
			bi++
			continue
		}
		if diff.Length == 0 {
			di++
			continue
		}
		// No overlap: base entirely before diff.
		if base.Offset+base.Length <= diff.Offset {
			result = append(result, base)
			bi++
			continue
		}
		// No overlap: diff entirely before base.
		if diff.Offset+diff.Length <= base.Offset {
			result = append(result, diff)
			di++
			continue
		}
		// Base fully inside diff — skip base (diff wins everywhere).
		if base.Offset >= diff.Offset && base.Offset+base.Length <= diff.Offset+diff.Length {
			bi++
			continue
		}
		// Diff fully inside base — split base around diff.
		if diff.Offset >= base.Offset && diff.Offset+diff.Length <= base.Offset+base.Length {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}
			result = append(result, diff)
			di++
			// The surviving right part keeps its storage position by
			// shifting both Offset and BuildStorageOffset equally.
			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift
			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}
		// Base starts after diff with overlap — emit diff, trim base.
		if base.Offset > diff.Offset {
			result = append(result, diff)
			di++
			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift
			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}
		// Diff starts after base with overlap — emit left part of base;
		// the overlapped remainder of base is handled on the next pass
		// (it will fall into the "base fully inside diff" case).
		if diff.Offset > base.Offset {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}
			bi++
			continue
		}
	}
	// Append remaining entries from whichever list was not exhausted.
	result = append(result, baseCopy[bi:]...)
	result = append(result, diffMapping[di:]...)
	return result
}
// NormalizeMappings merges adjacent entries that can be served as one
// contiguous read. Neighbors are merged only when all of the following hold:
//
//   - same BuildID,
//   - contiguous in the virtual address space, and
//   - contiguous in the diff file (skipped for uuid.Nil, whose ranges are
//     zero-filled and have no storage position).
//
// The contiguity checks fix a latent corruption in the previous version,
// which merged any adjacent same-ID entries: the merged entry kept only the
// first entry's BuildStorageOffset, so the second entry's range silently
// resolved to the wrong bytes whenever its diff-file position did not
// immediately follow the first's.
func NormalizeMappings(mappings []*BuildMap) []*BuildMap {
	if len(mappings) == 0 {
		return nil
	}
	result := make([]*BuildMap, 0, len(mappings))
	cur := &BuildMap{
		Offset:             mappings[0].Offset,
		Length:             mappings[0].Length,
		BuildID:            mappings[0].BuildID,
		BuildStorageOffset: mappings[0].BuildStorageOffset,
	}
	for _, m := range mappings[1:] {
		adjacent := m.Offset == cur.Offset+cur.Length
		// Zero-fill entries (uuid.Nil) never touch the diff file, so only
		// address adjacency matters for them.
		storageOK := m.BuildID == uuid.Nil ||
			m.BuildStorageOffset == cur.BuildStorageOffset+cur.Length
		if m.BuildID == cur.BuildID && adjacent && storageOK {
			cur.Length += m.Length
			continue
		}
		result = append(result, cur)
		cur = &BuildMap{
			Offset:             m.Offset,
			Length:             m.Length,
			BuildID:            m.BuildID,
			BuildStorageOffset: m.BuildStorageOffset,
		}
	}
	return append(result, cur)
}

View File

@ -0,0 +1,189 @@
package snapshot
import (
"fmt"
"io"
"os"
"github.com/google/uuid"
)
const (
// DefaultBlockSize is 4KB — standard page size for Firecracker.
DefaultBlockSize int64 = 4096
)
// ProcessMemfile reads a full memory file produced by Firecracker's
// PUT /snapshot/create, identifies non-zero blocks, and writes only those
// blocks to a compact diff file. Returns the Header describing the mapping.
//
// The output diff file contains non-zero blocks written sequentially.
// The header maps each block in the full address space to either:
//   - A position in the diff file (for non-zero blocks)
//   - uuid.Nil (for zero/empty blocks, served as zeros without I/O)
//
// buildID identifies this snapshot generation in the header chain.
func ProcessMemfile(memfilePath, diffPath, headerPath string, buildID uuid.UUID) (*Header, error) {
	src, err := os.Open(memfilePath)
	if err != nil {
		return nil, fmt.Errorf("open memfile: %w", err)
	}
	defer src.Close()
	info, err := src.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat memfile: %w", err)
	}
	memSize := info.Size()
	// NOTE(review): on the error returns below, a partially written diff
	// file is left at diffPath — presumably overwritten on retry; confirm.
	dst, err := os.Create(diffPath)
	if err != nil {
		return nil, fmt.Errorf("create diff file: %w", err)
	}
	defer dst.Close()
	totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
	dirty := make([]bool, totalBlocks)
	empty := make([]bool, totalBlocks)
	buf := make([]byte, DefaultBlockSize)
	// Single sequential pass: classify each 4KB block as zero or dirty,
	// appending dirty blocks to the compact diff file in order.
	for i := int64(0); i < totalBlocks; i++ {
		n, err := io.ReadFull(src, buf)
		if err != nil && err != io.ErrUnexpectedEOF {
			return nil, fmt.Errorf("read block %d: %w", i, err)
		}
		// Zero-pad the last block if it's short.
		if int64(n) < DefaultBlockSize {
			for j := n; j < int(DefaultBlockSize); j++ {
				buf[j] = 0
			}
		}
		if isZeroBlock(buf) {
			empty[i] = true
			continue
		}
		dirty[i] = true
		if _, err := dst.Write(buf); err != nil {
			return nil, fmt.Errorf("write diff block %d: %w", i, err)
		}
	}
	// Build header: dirty runs point at the diff file, empty runs at
	// uuid.Nil (zero-fill); together they cover the full address space.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
	merged := MergeMappings(dirtyMappings, emptyMappings)
	normalized := NormalizeMappings(merged)
	metadata := NewMetadata(buildID, uint64(DefaultBlockSize), uint64(memSize))
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}
	// Write header to disk.
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}
// ProcessMemfileWithParent processes a memory file as a new generation on top
// of an existing parent header. The new diff file contains only blocks that
// differ from what the parent header maps. This is used for re-pause of a
// sandbox that was restored from a snapshot.
//
// NOTE(review): despite the doc above, blocks are classified against zero,
// not against the parent's content — every non-zero block of the current
// memory is written to the new diff, and the merged mapping is fully
// shadowed by this generation. The result is correct but the diff is a
// full non-zero copy rather than a true delta; confirm whether a
// content-compare against the parent was intended.
func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHeader *Header, buildID uuid.UUID) (*Header, error) {
	src, err := os.Open(memfilePath)
	if err != nil {
		return nil, fmt.Errorf("open memfile: %w", err)
	}
	defer src.Close()
	info, err := src.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat memfile: %w", err)
	}
	memSize := info.Size()
	dst, err := os.Create(diffPath)
	if err != nil {
		return nil, fmt.Errorf("create diff file: %w", err)
	}
	defer dst.Close()
	totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
	dirty := make([]bool, totalBlocks)
	empty := make([]bool, totalBlocks)
	buf := make([]byte, DefaultBlockSize)
	// Same classification pass as ProcessMemfile: zero vs non-zero blocks.
	for i := int64(0); i < totalBlocks; i++ {
		n, err := io.ReadFull(src, buf)
		if err != nil && err != io.ErrUnexpectedEOF {
			return nil, fmt.Errorf("read block %d: %w", i, err)
		}
		// Zero-pad a short final block.
		if int64(n) < DefaultBlockSize {
			for j := n; j < int(DefaultBlockSize); j++ {
				buf[j] = 0
			}
		}
		if isZeroBlock(buf) {
			empty[i] = true
			continue
		}
		dirty[i] = true
		if _, err := dst.Write(buf); err != nil {
			return nil, fmt.Errorf("write diff block %d: %w", i, err)
		}
	}
	// Build new generation header merged with parent: the diff mapping
	// (dirty + zero-fill) is overlaid on the parent's mapping.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
	diffMapping := MergeMappings(dirtyMappings, emptyMappings)
	merged := MergeMappings(parentHeader.Mapping, diffMapping)
	normalized := NormalizeMappings(merged)
	metadata := parentHeader.Metadata.NextGeneration(buildID)
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}
// isZeroBlock reports whether every byte of block is zero.
// It ORs eight bytes per step so the common all-zero case touches each
// byte exactly once with minimal branching, then finishes byte-wise.
func isZeroBlock(block []byte) bool {
	n := len(block)
	i := 0
	for ; i+8 <= n; i += 8 {
		if block[i]|block[i+1]|block[i+2]|block[i+3]|
			block[i+4]|block[i+5]|block[i+6]|block[i+7] != 0 {
			return false
		}
	}
	for ; i < n; i++ {
		if block[i] != 0 {
			return false
		}
	}
	return true
}

87
internal/uffd/fd.go Normal file
View File

@ -0,0 +1,87 @@
// Package uffd implements a userfaultfd-based memory server for Firecracker
// snapshot restore. When a VM is restored from a snapshot, instead of loading
// the entire memory file upfront, the UFFD handler intercepts page faults
// and serves memory pages on demand from the snapshot's compact diff file.
//
// Inspired by e2b's UFFD implementation (Apache 2.0, modified by Omukk).
package uffd
/*
#include <sys/syscall.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
struct uffd_pagefault {
__u64 flags;
__u64 address;
__u32 ptid;
};
*/
import "C"
import (
"fmt"
"syscall"
"unsafe"
)
// Kernel constants re-exported from the cgo preamble so the rest of the
// package can avoid C expressions.
const (
	UFFD_EVENT_PAGEFAULT      = C.UFFD_EVENT_PAGEFAULT      // uffd_msg event: missing page fault
	UFFD_PAGEFAULT_FLAG_WRITE = C.UFFD_PAGEFAULT_FLAG_WRITE // fault was a write access
	UFFDIO_COPY               = C.UFFDIO_COPY               // ioctl: install a page into the faulting range
	UFFDIO_COPY_MODE_WP       = C.UFFDIO_COPY_MODE_WP       // UFFDIO_COPY mode bit: keep write-protection
)

// Aliases for the kernel structs used on the fault path.
type (
	uffdMsg       = C.struct_uffd_msg
	uffdPagefault = C.struct_uffd_pagefault
	uffdioCopy    = C.struct_uffdio_copy
)

// fd wraps a userfaultfd file descriptor received from Firecracker.
type fd uintptr
// copy installs a page into guest memory at the given address using UFFDIO_COPY.
// mode controls write-protection: use UFFDIO_COPY_MODE_WP to preserve WP bit.
//
// Assumes len(data) >= pagesize — the kernel copies exactly pagesize bytes
// from data's backing array (TODO confirm at call sites).
func (f fd) copy(addr, pagesize uintptr, data []byte, mode C.ulonglong) error {
	// UFFDIO_COPY requires a page-aligned destination; the reported fault
	// address may point anywhere inside the page, so align it down.
	alignedAddr := addr &^ (pagesize - 1)
	cpy := uffdioCopy{
		src:  C.ulonglong(uintptr(unsafe.Pointer(&data[0]))),
		dst:  C.ulonglong(alignedAddr),
		len:  C.ulonglong(pagesize),
		mode: mode,
		copy: 0,
	}
	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy)))
	if errno != 0 {
		return errno
	}
	// On success the kernel writes the number of bytes installed into
	// cpy.copy; anything short of a full page is treated as a failure.
	if cpy.copy != C.longlong(pagesize) {
		return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize)
	}
	return nil
}
// close closes the userfaultfd file descriptor.
func (f fd) close() error {
	return syscall.Close(int(f))
}

// getMsgEvent extracts the event type from a uffd_msg.
func getMsgEvent(msg *uffdMsg) C.uchar {
	return msg.event
}

// getMsgArg extracts the arg union from a uffd_msg as raw bytes; the caller
// reinterprets them according to the event type.
func getMsgArg(msg *uffdMsg) [24]byte {
	return msg.arg
}

// getPagefaultAddress extracts the faulting address from a uffd_pagefault.
func getPagefaultAddress(pf *uffdPagefault) uintptr {
	return uintptr(pf.address)
}

35
internal/uffd/region.go Normal file
View File

@ -0,0 +1,35 @@
package uffd
import "fmt"
// Region is a mapping of guest memory to host virtual address space.
// Firecracker sends these as JSON when connecting to the UFFD socket.
// The JSON field names match Firecracker's UFFD protocol.
type Region struct {
	BaseHostVirtAddr uintptr `json:"base_host_virt_addr"`
	Size             uintptr `json:"size"`
	Offset           uintptr `json:"offset"`
	PageSize         uintptr `json:"page_size_kib"` // Actually in bytes despite the name.
}

// Mapping translates between host virtual addresses and logical memory offsets.
type Mapping struct {
	Regions []Region
}

// NewMapping creates a Mapping from a list of regions.
func NewMapping(regions []Region) *Mapping {
	return &Mapping{Regions: regions}
}

// GetOffset converts a host virtual address to a logical memory file offset
// and returns the page size. This is called on every UFFD page fault.
func (m *Mapping) GetOffset(hostVirtAddr uintptr) (int64, uintptr, error) {
	for i := range m.Regions {
		region := &m.Regions[i]
		if hostVirtAddr < region.BaseHostVirtAddr || hostVirtAddr >= region.BaseHostVirtAddr+region.Size {
			continue
		}
		relative := int64(hostVirtAddr - region.BaseHostVirtAddr)
		return relative + int64(region.Offset), region.PageSize, nil
	}
	return 0, 0, fmt.Errorf("address %#x not found in any memory region", hostVirtAddr)
}

352
internal/uffd/server.go Normal file
View File

@ -0,0 +1,352 @@
package uffd
/*
#include <linux/userfaultfd.h>
*/
import "C"
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net"
"os"
"sync"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
"git.omukk.dev/wrenn/sandbox/internal/snapshot"
)
const (
	// fdSize is the byte width of the descriptor passed via SCM_RIGHTS.
	fdSize = 4
	// regionMappingsSize bounds the JSON region list Firecracker sends on connect.
	regionMappingsSize = 1024
	// maxConcurrentFaults caps in-flight page-fault goroutines.
	maxConcurrentFaults = 4096
)

// MemorySource provides page data for the UFFD handler.
// Given a logical memory offset and a size, it returns the page data.
type MemorySource interface {
	ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error)
}

// Server manages the UFFD Unix socket lifecycle and page fault handling
// for a single Firecracker snapshot restore.
type Server struct {
	socketPath string
	source     MemorySource

	lis *net.UnixListener

	// readyCh closes once Firecracker has connected and sent the uffd fd.
	readyCh   chan struct{}
	readyOnce sync.Once

	// doneCh closes when the handler goroutine exits; doneErr holds its result.
	doneCh  chan struct{}
	doneErr error

	// exitPipe signals the poll loop to stop.
	exitR *os.File
	exitW *os.File
}

// NewServer creates a UFFD server that will listen on the given socket path
// and serve memory pages from the given source.
func NewServer(socketPath string, source MemorySource) *Server {
	srv := &Server{
		socketPath: socketPath,
		source:     source,
		readyCh:    make(chan struct{}),
		doneCh:     make(chan struct{}),
	}
	return srv
}
// Start begins listening on the Unix socket. Firecracker will connect to this
// socket after loadSnapshot is called with the UFFD backend.
// Start returns immediately; the server runs in a background goroutine.
func (s *Server) Start(ctx context.Context) error {
	lis, err := net.ListenUnix("unix", &net.UnixAddr{Name: s.socketPath, Net: "unix"})
	if err != nil {
		return fmt.Errorf("listen on uffd socket: %w", err)
	}
	s.lis = lis
	// World-writable so a jailed/unprivileged Firecracker can connect.
	if err := os.Chmod(s.socketPath, 0o777); err != nil {
		lis.Close()
		return fmt.Errorf("chmod uffd socket: %w", err)
	}
	// Create exit signal pipe. Stop writes one byte to wake the poll loop.
	r, w, err := os.Pipe()
	if err != nil {
		lis.Close()
		return fmt.Errorf("create exit pipe: %w", err)
	}
	s.exitR = r
	s.exitW = w
	// NOTE(review): until Firecracker connects, this goroutine blocks in
	// Accept (inside handle), not in poll — confirm Stop can unblock it
	// during that window.
	go func() {
		defer close(s.doneCh)
		s.doneErr = s.handle(ctx)
		s.lis.Close()
		s.exitR.Close()
		s.exitW.Close()
		// Release any Ready() waiter even if handle failed before the
		// connection was established.
		s.readyOnce.Do(func() { close(s.readyCh) })
	}()
	return nil
}
// Ready returns a channel that is closed once the UFFD handler is operational,
// i.e. Firecracker has connected and handed over the uffd file descriptor.
func (s *Server) Ready() <-chan struct{} {
	return s.readyCh
}
// Stop signals the UFFD poll loop to exit and waits for it to finish.
//
// Closing the listener is required in addition to the exit-pipe write: if
// Firecracker never connected, the handler goroutine is blocked in Accept
// (not in poll), the pipe write alone would never wake it, and the wait on
// doneCh would deadlock forever.
func (s *Server) Stop() error {
	// Wake the poll loop, if it is running. Best effort; the error is
	// irrelevant because doneCh is awaited below either way.
	if s.exitW != nil {
		s.exitW.Write([]byte{0})
	}
	// Unblock a pending Accept in case no connection ever arrived.
	if s.lis != nil {
		s.lis.Close()
	}
	<-s.doneCh
	// An interrupted Accept reports net.ErrClosed; that is the expected
	// outcome of stopping before Firecracker connected, not a failure.
	if errors.Is(s.doneErr, net.ErrClosed) {
		return nil
	}
	return s.doneErr
}
// Wait blocks until the handler goroutine has fully exited and returns its
// terminal error, if any.
func (s *Server) Wait() error {
	<-s.doneCh
	return s.doneErr
}
// handle accepts the Firecracker connection, receives the UFFD fd via
// SCM_RIGHTS, and runs the page fault poll loop.
func (s *Server) handle(ctx context.Context) error {
	conn, err := s.lis.Accept()
	if err != nil {
		return fmt.Errorf("accept uffd connection: %w", err)
	}
	unixConn := conn.(*net.UnixConn)
	defer unixConn.Close()
	// Read the memory region mappings (JSON) and the UFFD fd (SCM_RIGHTS).
	// Firecracker sends both in a single message on connect.
	regionBuf := make([]byte, regionMappingsSize)
	uffdBuf := make([]byte, syscall.CmsgSpace(fdSize))
	nRegion, nFd, _, _, err := unixConn.ReadMsgUnix(regionBuf, uffdBuf)
	if err != nil {
		return fmt.Errorf("read uffd message: %w", err)
	}
	var regions []Region
	if err := json.Unmarshal(regionBuf[:nRegion], &regions); err != nil {
		return fmt.Errorf("parse memory regions: %w", err)
	}
	// Exactly one control message carrying exactly one descriptor is
	// expected; anything else indicates a protocol mismatch.
	controlMsgs, err := syscall.ParseSocketControlMessage(uffdBuf[:nFd])
	if err != nil {
		return fmt.Errorf("parse control messages: %w", err)
	}
	if len(controlMsgs) != 1 {
		return fmt.Errorf("expected 1 control message, got %d", len(controlMsgs))
	}
	fds, err := syscall.ParseUnixRights(&controlMsgs[0])
	if err != nil {
		return fmt.Errorf("parse unix rights: %w", err)
	}
	if len(fds) != 1 {
		return fmt.Errorf("expected 1 fd, got %d", len(fds))
	}
	uffdFd := fd(fds[0])
	// Safe to close after serve returns: serve waits for all in-flight
	// fault goroutines before returning.
	defer uffdFd.close()
	mapping := NewMapping(regions)
	slog.Info("uffd handler connected",
		"regions", len(regions),
		"fd", int(uffdFd),
	)
	// Signal readiness.
	s.readyOnce.Do(func() { close(s.readyCh) })
	// Run the poll loop.
	return s.serve(ctx, uffdFd, mapping)
}
// serve is the main poll loop. It polls the UFFD fd for page fault events
// and the exit pipe for shutdown signals. Fault resolution is dispatched to
// bounded worker goroutines so a slow disk read does not stall the loop.
func (s *Server) serve(ctx context.Context, uffdFd fd, mapping *Mapping) error {
	pollFds := []unix.PollFd{
		{Fd: int32(uffdFd), Events: unix.POLLIN},
		{Fd: int32(s.exitR.Fd()), Events: unix.POLLIN},
	}
	var wg sync.WaitGroup
	// sem bounds concurrent fault handlers at maxConcurrentFaults.
	sem := make(chan struct{}, maxConcurrentFaults)
	// Always wait for in-flight goroutines before returning, so the caller
	// can safely close the uffd fd after serve returns.
	defer wg.Wait()
	for {
		if _, err := unix.Poll(pollFds, -1); err != nil {
			if err == unix.EINTR || err == unix.EAGAIN {
				continue
			}
			return fmt.Errorf("poll: %w", err)
		}
		// Check exit signal.
		if pollFds[1].Revents&unix.POLLIN != 0 {
			return nil
		}
		if pollFds[0].Revents&unix.POLLIN == 0 {
			continue
		}
		// Read the uffd_msg. The fd is O_NONBLOCK (set by Firecracker),
		// so EAGAIN is expected — just go back to poll.
		buf := make([]byte, unsafe.Sizeof(uffdMsg{}))
		n, err := readUffdMsg(uffdFd, buf)
		if err == syscall.EAGAIN {
			continue
		}
		if err != nil {
			return fmt.Errorf("read uffd msg: %w", err)
		}
		if n == 0 {
			continue
		}
		// Reinterpret the raw bytes as the kernel structs. msg is copied
		// out of buf, so the per-iteration buffer can be dropped safely.
		msg := *(*uffdMsg)(unsafe.Pointer(&buf[0]))
		if getMsgEvent(&msg) != UFFD_EVENT_PAGEFAULT {
			return fmt.Errorf("unexpected uffd event type: %d", getMsgEvent(&msg))
		}
		arg := getMsgArg(&msg)
		pf := *(*uffdPagefault)(unsafe.Pointer(&arg[0]))
		addr := getPagefaultAddress(&pf)
		offset, pagesize, err := mapping.GetOffset(addr)
		if err != nil {
			return fmt.Errorf("resolve address %#x: %w", addr, err)
		}
		// Acquire a slot before spawning so backpressure applies here,
		// in the loop, rather than in the goroutine.
		sem <- struct{}{}
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer func() { <-sem }()
			// Errors are logged, not fatal: the guest retries the fault
			// and other pages remain serviceable.
			if err := s.faultPage(ctx, uffdFd, addr, offset, pagesize); err != nil {
				slog.Error("uffd fault page error",
					"addr", fmt.Sprintf("%#x", addr),
					"offset", offset,
					"error", err,
				)
			}
		}()
	}
}
// readUffdMsg reads a single uffd_msg, retrying on EINTR.
// Returns (n, EAGAIN) if the non-blocking read has nothing available.
func readUffdMsg(uffdFd fd, buf []byte) (int, error) {
	for {
		n, err := syscall.Read(int(uffdFd), buf)
		if err != syscall.EINTR {
			return n, err
		}
	}
}
// faultPage fetches a page from the memory source and copies it into
// guest memory via UFFDIO_COPY.
func (s *Server) faultPage(ctx context.Context, uffdFd fd, addr uintptr, offset int64, pagesize uintptr) error {
	page, err := s.source.ReadPage(ctx, offset, int64(pagesize))
	if err != nil {
		return fmt.Errorf("read page at offset %d: %w", offset, err)
	}
	// Mode 0: no write-protect. Standard Firecracker does not register
	// UFFD ranges with WP support, so UFFDIO_COPY_MODE_WP would fail.
	copyErr := uffdFd.copy(addr, pagesize, page, 0)
	switch {
	case copyErr == nil:
		return nil
	case errors.Is(copyErr, unix.EEXIST):
		// Page already mapped (race with prefetch or concurrent fault).
		return nil
	default:
		return fmt.Errorf("uffdio_copy: %w", copyErr)
	}
}
// DiffFileSource serves pages from a snapshot's compact diff file using
// the header's block mapping to resolve offsets. It implements MemorySource.
type DiffFileSource struct {
	header *snapshot.Header
	// diffs maps build ID → open file handle for each generation's diff file.
	// Handles stay open for the source's lifetime; Close releases them.
	diffs map[string]*os.File
}
// NewDiffFileSource creates a memory source backed by snapshot diff files.
// diffs maps build ID string to the file path of each generation's diff file.
// On any open failure, every handle opened so far is closed before returning.
func NewDiffFileSource(header *snapshot.Header, diffPaths map[string]string) (*DiffFileSource, error) {
	opened := make(map[string]*os.File, len(diffPaths))
	for buildID, path := range diffPaths {
		f, err := os.Open(path)
		if err != nil {
			for _, prev := range opened {
				prev.Close()
			}
			return nil, fmt.Errorf("open diff file %s: %w", path, err)
		}
		opened[buildID] = f
	}
	return &DiffFileSource{header: header, diffs: opened}, nil
}
// ReadPage resolves a memory offset through the header mapping and reads
// the corresponding page from the correct generation's diff file.
func (s *DiffFileSource) ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error) {
	mappedOffset, _, buildID, err := s.header.GetShiftedMapping(ctx, offset)
	if err != nil {
		return nil, fmt.Errorf("resolve offset %d: %w", offset, err)
	}
	// uuid.Nil means zero-fill (empty page).
	var zeroID [16]byte
	if *buildID == zeroID {
		return make([]byte, size), nil
	}
	diffFile, ok := s.diffs[buildID.String()]
	if !ok {
		return nil, fmt.Errorf("no diff file for build %s", buildID)
	}
	page := make([]byte, size)
	// ReadAt returning io.EOF with a full page is fine; only a genuinely
	// short read is an error.
	if n, readErr := diffFile.ReadAt(page, mappedOffset); readErr != nil && int64(n) < size {
		return nil, fmt.Errorf("read diff at offset %d: %w", mappedOffset, readErr)
	}
	return page, nil
}
// Close closes all open diff file handles.
// Errors from individual handles are collected and joined.
func (s *DiffFileSource) Close() error {
	var closeErrs []error
	for _, handle := range s.diffs {
		if cerr := handle.Close(); cerr != nil {
			closeErrs = append(closeErrs, cerr)
		}
	}
	return errors.Join(closeErrs...)
}

View File

@ -70,7 +70,7 @@ func (c *VMConfig) applyDefaults() {
c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID)
}
if c.SandboxDir == "" {
c.SandboxDir = fmt.Sprintf("/tmp/fc-sandbox-%s", c.SandboxID)
c.SandboxDir = "/tmp/fc-vm"
}
if c.TapDevice == "" {
c.TapDevice = "tap0"

View File

@ -95,7 +95,7 @@ func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, ma
// setMachineConfig configures vCPUs, memory, and other machine settings.
func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error {
return c.do(ctx, http.MethodPut, "/machine-config", map[string]any{
"vcpu_count": vcpus,
"vcpu_count": vcpus,
"mem_size_mib": memMB,
"smt": false,
})
@ -131,7 +131,7 @@ func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string)
})
}
// loadSnapshot loads a VM snapshot.
// loadSnapshot loads a VM snapshot from a file-backed memory image.
func (c *fcClient) loadSnapshot(ctx context.Context, snapPath, memPath string) error {
return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
"snapshot_path": snapPath,
@ -139,3 +139,17 @@ func (c *fcClient) loadSnapshot(ctx context.Context, snapPath, memPath string) e
"resume_vm": false,
})
}
// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
// lazy memory loading. Firecracker will connect to the socket and
// send the uffd fd + memory region mappings.
func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error {
	payload := map[string]any{
		"snapshot_path": snapPath,
		"resume_vm":     false,
		// Uffd backend: Firecracker dials this socket and hands over
		// the userfaultfd plus guest memory region mappings.
		"mem_backend": map[string]any{
			"backend_type": "Uffd",
			"backend_path": uffdSocketPath,
		},
	}
	return c.do(ctx, http.MethodPut, "/snapshot/load", payload)
}

View File

@ -91,8 +91,10 @@ func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
return fmt.Errorf("set boot source: %w", err)
}
// Root drive
if err := client.setRootfsDrive(ctx, "rootfs", cfg.RootfsPath, false); err != nil {
// Root drive — use the symlink path inside the mount namespace so that
// snapshots record a stable path that works on restore.
rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
return fmt.Errorf("set rootfs drive: %w", err)
}
@ -162,6 +164,92 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
return nil
}
// Snapshot creates a full VM snapshot. The VM must already be paused.
// Produces a snapfile (VM state) and a memfile (full memory dump).
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath string) error {
	vm, found := m.vms[sandboxID]
	if !found {
		return fmt.Errorf("VM not found: %s", sandboxID)
	}
	if snapErr := vm.client.createSnapshot(ctx, snapPath, memPath); snapErr != nil {
		return fmt.Errorf("create snapshot: %w", snapErr)
	}
	slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath)
	return nil
}
// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot
// using UFFD for lazy memory loading. The network namespace and TAP
// device must already be set up.
//
// No boot resources (kernel, drives, machine config) are configured —
// the snapshot carries all that state. The rootfs path recorded in the
// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4
// inside the mount namespace (created by the start script in jailer.go).
//
// The sequence is:
//  1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink)
//  2. Wait for API socket
//  3. Load snapshot with UFFD backend
//  4. Resume VM execution
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) {
	cfg.applyDefaults()
	if err := cfg.validate(); err != nil {
		return nil, fmt.Errorf("invalid config: %w", err)
	}
	// Best-effort removal of a stale API socket left by a previous run.
	os.Remove(cfg.SocketPath)
	slog.Info("restoring VM from snapshot",
		"sandbox", cfg.SandboxID,
		"snap_path", snapPath,
	)

	// Step 1: Launch the Firecracker process.
	// The start script creates a tmpfs at SandboxDir and symlinks
	// rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs
	// path (/fc-vm/rootfs.ext4) resolves to the new clone.
	fcProc, err := startProcess(ctx, &cfg)
	if err != nil {
		return nil, fmt.Errorf("start process: %w", err)
	}

	// Any failure past this point must tear the process down.
	abort := func(stage string, cause error) (*VM, error) {
		fcProc.stop()
		return nil, fmt.Errorf("%s: %w", stage, cause)
	}

	// Step 2: Wait for the API socket.
	if err := waitForSocket(ctx, cfg.SocketPath, fcProc); err != nil {
		return abort("wait for socket", err)
	}
	api := newFCClient(cfg.SocketPath)

	// Step 3: Load the snapshot with UFFD backend.
	// No boot resources are configured — the snapshot carries kernel,
	// drive, network, and machine config state.
	if err := api.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
		return abort("load snapshot", err)
	}

	// Step 4: Resume the VM.
	if err := api.resumeVM(ctx); err != nil {
		return abort("resume VM", err)
	}

	restored := &VM{
		Config:  cfg,
		process: fcProc,
		client:  api,
	}
	m.vms[cfg.SandboxID] = restored
	slog.Info("VM restored from snapshot", "sandbox", cfg.SandboxID)
	return restored, nil
}
// Get returns a running VM by sandbox ID.
func (m *Manager) Get(sandboxID string) (*VM, bool) {
vm, ok := m.vms[sandboxID]