1
0
forked from wrenn/wrenn

feat(vm): replace Firecracker with Cloud Hypervisor

Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH).
CH provides native snapshot/restore via its HTTP API, eliminating the
need for custom UFFD handling, memfile processing, and snapshot header
management that Firecracker required.

Key changes:
- Remove fc.go, jailer.go (FC process management)
- Remove internal/uffd/ package (userfaultfd lazy page loading)
- Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format)
- Add ch.go (CH HTTP API client over Unix socket)
- Add process.go (CH process lifecycle with unshare+netns)
- Add chversion.go (CH version detection)
- Refactor sandbox manager: remove UFFD socket tracking, snapshot
  parent/diff chaining, FC-specific balloon logic; add crash watcher
- Simplify snapshot/local.go to CH's native snapshot format
- Update VM config: FirecrackerBin → VMMBin, new CH-specific fields
- Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
2026-05-17 01:33:12 +06:00
parent c2dc382787
commit eaa6b8576d
25 changed files with 754 additions and 2267 deletions

View File

@ -126,22 +126,22 @@ func main() {
} }
slog.Info("resolved kernel", "version", kernelVersion, "path", kernelPath) slog.Info("resolved kernel", "version", kernelVersion, "path", kernelPath)
// Detect firecracker version. // Detect cloud-hypervisor version.
fcBin := envOrDefault("WRENN_FIRECRACKER_BIN", "/usr/local/bin/firecracker") chBin := envOrDefault("WRENN_CH_BIN", "/usr/local/bin/cloud-hypervisor")
fcVersion, err := sandbox.DetectFirecrackerVersion(fcBin) chVersion, err := sandbox.DetectCHVersion(chBin)
if err != nil { if err != nil {
slog.Error("failed to detect firecracker version", "error", err) slog.Error("failed to detect cloud-hypervisor version", "error", err)
os.Exit(1) os.Exit(1)
} }
slog.Info("resolved firecracker", "version", fcVersion, "path", fcBin) slog.Info("resolved cloud-hypervisor", "version", chVersion, "path", chBin)
cfg := sandbox.Config{ cfg := sandbox.Config{
WrennDir: rootDir, WrennDir: rootDir,
DefaultRootfsSizeMB: defaultRootfsSizeMB, DefaultRootfsSizeMB: defaultRootfsSizeMB,
KernelPath: kernelPath, KernelPath: kernelPath,
KernelVersion: kernelVersion, KernelVersion: kernelVersion,
FirecrackerBin: fcBin, VMMBin: chBin,
FirecrackerVersion: fcVersion, VMMVersion: chVersion,
AgentVersion: version, AgentVersion: version,
} }
@ -245,12 +245,16 @@ func main() {
}, },
) )
// Graceful shutdown on SIGINT/SIGTERM. // Graceful shutdown on SIGINT/SIGTERM. A second signal force-exits
// so the operator can always kill the process if shutdown hangs.
sigCh := make(chan os.Signal, 1) sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
go func() { go func() {
sig := <-sigCh sig := <-sigCh
doShutdown("signal: " + sig.String()) go doShutdown("signal: " + sig.String())
sig = <-sigCh
slog.Error("received second signal, force exiting", "signal", sig.String())
os.Exit(1)
}() }()
slog.Info("host agent starting", "addr", listenAddr, "host_id", creds.HostID, "version", version, "commit", commit) slog.Info("host agent starting", "addr", listenAddr, "host_id", creds.HostID, "version", version, "commit", commit)
@ -292,7 +296,7 @@ func checkPrivileges() error {
name string name string
}{ }{
{1, "CAP_DAC_OVERRIDE"}, // /dev/loop*, /dev/mapper/*, /dev/net/tun {1, "CAP_DAC_OVERRIDE"}, // /dev/loop*, /dev/mapper/*, /dev/net/tun
{5, "CAP_KILL"}, // SIGTERM/SIGKILL to Firecracker processes {5, "CAP_KILL"}, // SIGTERM/SIGKILL to cloud-hypervisor processes
{12, "CAP_NET_ADMIN"}, // netlink, iptables, routing, TAP/veth {12, "CAP_NET_ADMIN"}, // netlink, iptables, routing, TAP/veth
{13, "CAP_NET_RAW"}, // raw sockets (iptables) {13, "CAP_NET_RAW"}, // raw sockets (iptables)
{19, "CAP_SYS_PTRACE"}, // reading /proc/self/ns/net (netns.Get) {19, "CAP_SYS_PTRACE"}, // reading /proc/self/ns/net (netns.Get)

View File

@ -19,6 +19,12 @@ import (
// it is considered unreachable (3 missed 30-second heartbeats). // it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second const unreachableThreshold = 90 * time.Second
// transientGracePeriod is how long a sandbox is allowed to stay in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state. This prevents the monitor from racing against in-flight RPCs
// that may not have registered the sandbox on the host agent yet.
const transientGracePeriod = 2 * time.Minute
// HostMonitor runs on a fixed interval and performs two duties: // HostMonitor runs on a fixed interval and performs two duties:
// //
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as // 1. Passive check: marks hosts whose last_heartbeat_at is stale as
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
} }
continue continue
} }
// Sandbox is not alive on host — infer final state. // Sandbox is not alive on host. If the transition is recent, give the
// in-flight RPC time to finish before declaring a final state.
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
slog.Debug("host monitor: transient sandbox still within grace period",
"sandbox_id", sbIDStr, "status", sb.Status,
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
continue
}
// Grace period expired — infer final state.
var finalStatus string var finalStatus string
switch sb.Status { switch sb.Status {
case "starting", "resuming": case "starting", "resuming":

View File

@ -42,6 +42,7 @@ const (
SandboxEventResumed = "sandbox.resumed" SandboxEventResumed = "sandbox.resumed"
SandboxEventStopped = "sandbox.stopped" SandboxEventStopped = "sandbox.stopped"
SandboxEventFailed = "sandbox.failed" SandboxEventFailed = "sandbox.failed"
SandboxEventError = "sandbox.error"
SandboxEventAutoPaused = "sandbox.auto_paused" SandboxEventAutoPaused = "sandbox.auto_paused"
) )
@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
c.handlePaused(ctx, sandboxID, event) c.handlePaused(ctx, sandboxID, event)
case SandboxEventStopped: case SandboxEventStopped:
c.handleStopped(ctx, sandboxID, event) c.handleStopped(ctx, sandboxID, event)
case SandboxEventFailed: case SandboxEventFailed, SandboxEventError:
c.handleFailed(ctx, sandboxID) c.handleFailed(ctx, sandboxID)
case SandboxEventAutoPaused: case SandboxEventAutoPaused:
c.handleAutoPaused(ctx, sandboxID, event) c.handleAutoPaused(ctx, sandboxID, event)
@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp
} }
func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) { func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) {
// Try stopping → stopped (CP-initiated destroy completed).
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, ID: sandboxID,
Status: "stopping", Status: "stopping",
Status_2: "stopped", Status_2: "stopped",
}); err == nil {
return
}
// Try running → stopped (autonomous destroy, e.g. TTL auto-destroy).
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID,
Status: "running",
Status_2: "stopped",
}); err != nil && !errors.Is(err, pgx.ErrNoRows) { }); err != nil && !errors.Is(err, pgx.ErrNoRows) {
slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err) slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err)
} }
} }
// handleFailed is a no-op fallback — the background goroutine already // handleFailed marks a sandbox as "error" when the host agent reports a crash
// performed the conditional DB update before publishing this event. // or the CP's background goroutine publishes a failure. Uses conditional update
// We keep the case arm so unknown event types are flagged, but avoid // to avoid clobbering concurrent operations.
// an unconditional status write that could clobber concurrent operations. func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {} // Try running → error (VM crash pushed by host agent).
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "running", Status_2: "error",
}); err == nil {
return
}
// Try starting → error (create failed).
_, _ = c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "starting", Status_2: "error",
})
}
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) { func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {
sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{

View File

@ -80,8 +80,8 @@ func (r *LoopRegistry) Release(imagePath string) {
e.refcount-- e.refcount--
if e.refcount <= 0 { if e.refcount <= 0 {
if err := losetupDetach(e.device); err != nil { if err := losetupDetachRetry(e.device); err != nil {
slog.Warn("losetup detach failed", "device", e.device, "error", err) slog.Error("losetup detach failed, loop device leaked", "device", e.device, "image", imagePath, "error", err)
} }
delete(r.entries, imagePath) delete(r.entries, imagePath)
slog.Info("loop device released", "image", imagePath, "device", e.device) slog.Info("loop device released", "image", imagePath, "device", e.device)
@ -94,8 +94,8 @@ func (r *LoopRegistry) ReleaseAll() {
defer r.mu.Unlock() defer r.mu.Unlock()
for path, e := range r.entries { for path, e := range r.entries {
if err := losetupDetach(e.device); err != nil { if err := losetupDetachRetry(e.device); err != nil {
slog.Warn("losetup detach failed", "device", e.device, "error", err) slog.Error("losetup detach failed during shutdown", "device", e.device, "image", path, "error", err)
} }
delete(r.entries, path) delete(r.entries, path)
} }
@ -134,8 +134,8 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes, cowSiz
// space to store all modified blocks (it's sparse, so 20GB costs nothing). // space to store all modified blocks (it's sparse, so 20GB costs nothing).
sectors := originSizeBytes / 512 sectors := originSizeBytes / 512
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
if detachErr := losetupDetach(cowLoopDev); detachErr != nil { if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil {
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr)
} }
os.Remove(cowPath) os.Remove(cowPath)
return nil, fmt.Errorf("dmsetup create: %w", err) return nil, fmt.Errorf("dmsetup create: %w", err)
@ -178,8 +178,8 @@ func RestoreSnapshot(ctx context.Context, name, originLoopDev, cowPath string, o
sectors := originSizeBytes / 512 sectors := originSizeBytes / 512
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
if detachErr := losetupDetach(cowLoopDev); detachErr != nil { if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil {
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr)
} }
return nil, fmt.Errorf("dmsetup create: %w", err) return nil, fmt.Errorf("dmsetup create: %w", err)
} }
@ -208,8 +208,8 @@ func RemoveSnapshot(ctx context.Context, dev *SnapshotDevice) error {
return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err) return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err)
} }
if err := losetupDetach(dev.CowLoopDev); err != nil { if err := losetupDetachRetry(dev.CowLoopDev); err != nil {
slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err) return fmt.Errorf("detach cow loop %s: %w", dev.CowLoopDev, err)
} }
slog.Info("dm-snapshot removed", "name", dev.Name) slog.Info("dm-snapshot removed", "name", dev.Name)
@ -297,6 +297,24 @@ func losetupDetach(dev string) error {
return exec.Command("losetup", "-d", dev).Run() return exec.Command("losetup", "-d", dev).Run()
} }
// losetupDetachRetry detaches a loop device with retries for transient
// "device busy" errors (kernel may still hold references briefly after
// dm-snapshot removal).
func losetupDetachRetry(dev string) error {
var lastErr error
for attempt := range 5 {
if attempt > 0 {
time.Sleep(200 * time.Millisecond)
}
if err := losetupDetach(dev); err == nil {
return nil
} else {
lastErr = err
}
}
return fmt.Errorf("after 5 attempts: %w", lastErr)
}
// dmsetupCreate creates a dm-snapshot device with persistent metadata. // dmsetupCreate creates a dm-snapshot device with persistent metadata.
func dmsetupCreate(name, originDev, cowDev string, sectors int64) error { func dmsetupCreate(name, originDev, cowDev string, sectors int64) error {
// Table format: <start> <size> snapshot <origin> <cow> P <chunk_size> // Table format: <start> <size> snapshot <origin> <cow> P <chunk_size>
@ -316,7 +334,7 @@ func dmDeviceExists(name string) bool {
// dmsetupRemove removes a device-mapper device, retrying on transient // dmsetupRemove removes a device-mapper device, retrying on transient
// "device busy" errors that occur when the kernel hasn't fully released // "device busy" errors that occur when the kernel hasn't fully released
// the device after a Firecracker process exits. // the device after a VMM process exits.
func dmsetupRemove(ctx context.Context, name string) error { func dmsetupRemove(ctx context.Context, name string) error {
var lastErr error var lastErr error
for attempt := range 5 { for attempt := range 5 {

View File

@ -294,7 +294,7 @@ func (c *Client) ReadFile(ctx context.Context, path string) ([]byte, error) {
// PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which stops // PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which stops
// the port scanner/forwarder and marks active connections for post-restore // the port scanner/forwarder and marks active connections for post-restore
// cleanup before Firecracker freezes vCPUs. // cleanup before the VMM freezes vCPUs.
// //
// Best-effort: the caller should log a warning on error but not abort the pause. // Best-effort: the caller should log a warning on error but not abort the pause.
func (c *Client) PrepareSnapshot(ctx context.Context) error { func (c *Client) PrepareSnapshot(ctx context.Context) error {
@ -317,27 +317,33 @@ func (c *Client) PrepareSnapshot(ctx context.Context) error {
return nil return nil
} }
// PostInit calls envd's POST /init endpoint, which triggers a re-read of // PostInit calls envd's POST /init endpoint to trigger post-boot or
// Firecracker MMDS metadata. This updates WRENN_SANDBOX_ID, WRENN_TEMPLATE_ID // post-restore initialization. sandbox_id and template_id are passed
// env vars and the corresponding files under /run/wrenn/ inside the guest. // so envd can set WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID env vars.
// Must be called after snapshot restore so envd picks up the new sandbox's metadata.
func (c *Client) PostInit(ctx context.Context) error { func (c *Client) PostInit(ctx context.Context) error {
return c.PostInitWithDefaults(ctx, "", nil) return c.PostInitWithDefaults(ctx, "", nil, "", "")
} }
// PostInitWithDefaults calls envd's POST /init endpoint with optional default // PostInitWithDefaults calls envd's POST /init endpoint with optional default
// user and environment variables. These are applied to envd's defaults so all // user, environment variables, and sandbox metadata. These are applied to
// subsequent process executions use them. // envd's defaults so all subsequent process executions use them.
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string) error { func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID string) error {
payload := make(map[string]any)
if defaultUser != "" {
payload["defaultUser"] = defaultUser
}
if len(envVars) > 0 {
payload["envVars"] = envVars
}
if sandboxID != "" {
payload["sandbox_id"] = sandboxID
}
if templateID != "" {
payload["template_id"] = templateID
}
var body io.Reader var body io.Reader
if defaultUser != "" || len(envVars) > 0 { if len(payload) > 0 {
payload := make(map[string]any)
if defaultUser != "" {
payload["defaultUser"] = defaultUser
}
if len(envVars) > 0 {
payload["envVars"] = envVars
}
data, err := json.Marshal(payload) data, err := json.Marshal(payload)
if err != nil { if err != nil {
return fmt.Errorf("marshal init body: %w", err) return fmt.Errorf("marshal init body: %w", err)

View File

@ -46,7 +46,7 @@ func SandboxesDir(wrennDir string) string {
return filepath.Join(wrennDir, "sandboxes") return filepath.Join(wrennDir, "sandboxes")
} }
// KernelPath returns the path to the Firecracker kernel. // KernelPath returns the path to the VM kernel.
func KernelPath(wrennDir string) string { func KernelPath(wrennDir string) string {
return filepath.Join(wrennDir, "kernels", "vmlinux") return filepath.Join(wrennDir, "kernels", "vmlinux")
} }

View File

@ -176,7 +176,7 @@ func NewSlot(index int) *Slot {
// CreateNetwork sets up the full network topology for a sandbox: // CreateNetwork sets up the full network topology for a sandbox:
// - Named network namespace // - Named network namespace
// - Veth pair bridging host and namespace // - Veth pair bridging host and namespace
// - TAP device inside namespace for Firecracker // - TAP device inside namespace for Cloud Hypervisor
// - Routes and NAT rules for connectivity // - Routes and NAT rules for connectivity
// //
// On error, all partially created resources are rolled back. // On error, all partially created resources are rolled back.

View File

@ -0,0 +1,28 @@
package sandbox
import (
"fmt"
"os/exec"
"strings"
)
// DetectCHVersion runs the cloud-hypervisor binary with --version and
// parses the semver from the output (e.g. "cloud-hypervisor v43.0" → "43.0").
func DetectCHVersion(binaryPath string) (string, error) {
	raw, err := exec.Command(binaryPath, "--version").Output()
	if err != nil {
		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
	}
	trimmed := strings.TrimSpace(string(raw))
	for _, tok := range strings.Fields(trimmed) {
		stripped := strings.TrimPrefix(tok, "v")
		hadPrefix := stripped != tok
		// A "v" prefix or an embedded dot marks a version-looking token;
		// require at least one dot so bare words are never mistaken for one.
		if (hadPrefix || strings.Contains(tok, ".")) && strings.Count(stripped, ".") >= 1 {
			return stripped, nil
		}
	}
	return "", fmt.Errorf("could not parse version from cloud-hypervisor output: %q", trimmed)
}

View File

@ -1,30 +0,0 @@
package sandbox
import (
"fmt"
"os/exec"
"strings"
)
// DetectFirecrackerVersion runs the firecracker binary with --version and
// parses the semver from the output (e.g. "Firecracker v1.14.1" → "1.14.1").
func DetectFirecrackerVersion(binaryPath string) (string, error) {
	raw, err := exec.Command(binaryPath, "--version").Output()
	if err != nil {
		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
	}
	// Typical output: "Firecracker v1.14.1\n".
	trimmed := strings.TrimSpace(string(raw))
	for _, tok := range strings.Fields(trimmed) {
		stripped := strings.TrimPrefix(tok, "v")
		hadPrefix := stripped != tok
		// A "v" prefix or an embedded dot marks a version-looking token;
		// require at least one dot so bare words are never mistaken for one.
		if (hadPrefix || strings.Contains(tok, ".")) && strings.Count(stripped, ".") >= 1 {
			return stripped, nil
		}
	}
	return "", fmt.Errorf("could not parse version from firecracker output: %q", trimmed)
}

File diff suppressed because it is too large Load Diff

View File

@ -1,9 +1,11 @@
package sandbox package sandbox
import ( import (
"context"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"net/http"
"os" "os"
"strconv" "strconv"
"strings" "strings"
@ -50,10 +52,15 @@ func readCPUStat(pid int) (cpuStat, error) {
// readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns // readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns
// guest-side total - MemAvailable (actual process memory, excluding reclaimable // guest-side total - MemAvailable (actual process memory, excluding reclaimable
// page cache). VmRSS of the Firecracker process includes guest page cache and // page cache). VmRSS of the VMM process includes guest page cache and
// never decreases, so this is the accurate metric for dashboard display. // never decreases, so this is the accurate metric for dashboard display.
func readEnvdMemUsed(client *envdclient.Client) (int64, error) { func readEnvdMemUsed(ctx context.Context, client *envdclient.Client) (int64, error) {
resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics") req, err := http.NewRequestWithContext(ctx, http.MethodGet, client.BaseURL()+"/metrics", nil)
if err != nil {
return 0, fmt.Errorf("build metrics request: %w", err)
}
resp, err := client.HTTPClient().Do(req)
if err != nil { if err != nil {
return 0, fmt.Errorf("fetch envd metrics: %w", err) return 0, fmt.Errorf("fetch envd metrics: %w", err)
} }

View File

@ -1,221 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
// Package snapshot implements snapshot storage, header-based memory mapping,
// and memory file processing for Firecracker VM snapshots.
//
// The header system implements a generational copy-on-write memory mapping.
// Each snapshot generation stores only the blocks that changed since the
// previous generation. A Header contains a sorted list of BuildMap entries
// that together cover the entire memory address space, with each entry
// pointing to a specific generation's diff file.
package snapshot
import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"sort"

	"github.com/google/uuid"
)
// metadataVersion is the current binary format version written into
// Metadata.Version by NewMetadata and carried forward by NextGeneration.
const metadataVersion = 1

// Metadata is the fixed-size header prefix describing the snapshot memory layout.
// Binary layout (little-endian, 64 bytes total):
//
//	Version     uint64   (8 bytes)
//	BlockSize   uint64   (8 bytes)
//	Size        uint64   (8 bytes) — total memory size in bytes
//	Generation  uint64   (8 bytes)
//	BuildID     [16]byte (UUID)
//	BaseBuildID [16]byte (UUID)
//
// All fields are fixed-width so the struct can be written/read directly with
// encoding/binary (see Serialize/Deserialize).
type Metadata struct {
	Version     uint64
	BlockSize   uint64
	Size        uint64
	Generation  uint64
	BuildID     uuid.UUID
	BaseBuildID uuid.UUID
}
// NewMetadata builds the Metadata for generation zero of a snapshot chain.
// The base build ID equals the build ID since there is no earlier generation.
func NewMetadata(buildID uuid.UUID, blockSize, size uint64) *Metadata {
	m := Metadata{
		Version:     metadataVersion,
		BlockSize:   blockSize,
		Size:        size,
		BuildID:     buildID,
		BaseBuildID: buildID,
	}
	// Generation stays at its zero value for a first-generation snapshot.
	return &m
}
// NextGeneration derives the metadata for the generation that follows m in
// the chain: geometry, version, and base build are kept, the generation
// counter is incremented, and the build ID is replaced with the new one.
func (m *Metadata) NextGeneration(buildID uuid.UUID) *Metadata {
	next := *m
	next.Generation++
	next.BuildID = buildID
	return &next
}
// BuildMap maps a contiguous range of the memory address space to a specific
// generation's diff file. Binary layout (little-endian, 40 bytes):
//
//	Offset             uint64   — byte offset in the virtual address space
//	Length             uint64   — byte count (multiple of BlockSize)
//	BuildID            [16]byte — which generation's diff file, uuid.Nil = zero-fill
//	BuildStorageOffset uint64   — byte offset within that generation's diff file
//
// Entries are kept sorted by Offset and together cover the whole address
// space (see the package doc).
type BuildMap struct {
	Offset             uint64
	Length             uint64
	BuildID            uuid.UUID
	BuildStorageOffset uint64
}
// Header is the in-memory representation of a snapshot's memory mapping.
// It provides O(log N) lookup from any memory offset to the correct
// generation's diff file and offset within it. Construct via NewHeader,
// which precomputes the block-index structures below.
type Header struct {
	Metadata *Metadata
	Mapping  []*BuildMap

	// blockStarts tracks which block indices start a new BuildMap entry.
	// startMap provides direct access from block index to the BuildMap.
	blockStarts []bool
	startMap    map[int64]*BuildMap
}
// NewHeader assembles a Header from metadata plus its mapping entries and
// precomputes the block-index structures used for offset lookups. A nil or
// empty mapping is replaced by a single entry spanning the full memory size.
// Returns an error when BlockSize is zero, since every index computation
// divides by it.
func NewHeader(metadata *Metadata, mapping []*BuildMap) (*Header, error) {
	if metadata.BlockSize == 0 {
		return nil, fmt.Errorf("block size cannot be zero")
	}
	if len(mapping) == 0 {
		full := &BuildMap{
			Offset:             0,
			Length:             metadata.Size,
			BuildID:            metadata.BuildID,
			BuildStorageOffset: 0,
		}
		mapping = []*BuildMap{full}
	}
	total := TotalBlocks(int64(metadata.Size), int64(metadata.BlockSize))
	h := &Header{
		Metadata:    metadata,
		Mapping:     mapping,
		blockStarts: make([]bool, total),
		startMap:    make(map[int64]*BuildMap, len(mapping)),
	}
	for _, entry := range mapping {
		idx := BlockIdx(int64(entry.Offset), int64(metadata.BlockSize))
		// Entries whose offset falls outside the address space are
		// ignored rather than rejected, matching lookup behavior.
		if idx < 0 || idx >= total {
			continue
		}
		h.blockStarts[idx] = true
		h.startMap[idx] = entry
	}
	return h, nil
}
// GetShiftedMapping resolves a memory offset to the corresponding diff file
// offset, remaining length, and build ID. This is the hot path called for
// every UFFD page fault, so the lookup is a true O(log N) binary search over
// the sorted Mapping slice — the previous implementation walked blockStarts
// backwards linearly, which is O(blocks) per fault.
//
// The offset is first rounded down to its block boundary (faults are served
// at block granularity); the returned mappedOffset is therefore block-aligned
// within the owning generation's diff file, and mappedLength is the bytes
// remaining in that mapping from the aligned position. Errors if offset lies
// outside [0, Size) or past the end of its mapping entry.
//
// NOTE(review): assumes each BuildMap.Offset is block-aligned, as the
// BuildMap doc implies (Length is a multiple of BlockSize) — confirm against
// the snapshot builder.
func (h *Header) GetShiftedMapping(_ context.Context, offset int64) (mappedOffset int64, mappedLength int64, buildID *uuid.UUID, err error) {
	if offset < 0 || offset >= int64(h.Metadata.Size) {
		return 0, 0, nil, fmt.Errorf("offset %d out of bounds (size: %d)", offset, h.Metadata.Size)
	}
	blockSize := int64(h.Metadata.BlockSize)
	aligned := BlockOffset(BlockIdx(offset, blockSize), blockSize)

	// Mapping is sorted by Offset (see package doc). sort.Search finds the
	// first entry strictly past the aligned offset; the containing entry,
	// if any, is the one immediately before it.
	i := sort.Search(len(h.Mapping), func(j int) bool {
		return int64(h.Mapping[j].Offset) > aligned
	}) - 1
	if i < 0 {
		return 0, 0, nil, fmt.Errorf("no mapping found for offset %d", offset)
	}
	m := h.Mapping[i]
	shift := aligned - int64(m.Offset)
	if shift >= int64(m.Length) {
		return 0, 0, nil, fmt.Errorf("offset %d beyond mapping end (mapping offset=%d, length=%d)", offset, m.Offset, m.Length)
	}
	return int64(m.BuildStorageOffset) + shift, int64(m.Length) - shift, &m.BuildID, nil
}
// Serialize writes metadata followed by every mapping entry to a byte slice
// in the little-endian binary layout documented on Metadata and BuildMap.
func Serialize(metadata *Metadata, mappings []*BuildMap) ([]byte, error) {
	var out bytes.Buffer
	if err := binary.Write(&out, binary.LittleEndian, metadata); err != nil {
		return nil, fmt.Errorf("write metadata: %w", err)
	}
	for _, entry := range mappings {
		if err := binary.Write(&out, binary.LittleEndian, entry); err != nil {
			return nil, fmt.Errorf("write mapping: %w", err)
		}
	}
	return out.Bytes(), nil
}
// Deserialize parses a binary header: one Metadata record followed by zero or
// more BuildMap records until EOF, then validates the result via NewHeader.
func Deserialize(data []byte) (*Header, error) {
	r := bytes.NewReader(data)

	var meta Metadata
	if err := binary.Read(r, binary.LittleEndian, &meta); err != nil {
		return nil, fmt.Errorf("read metadata: %w", err)
	}

	var entries []*BuildMap
	for {
		entry := new(BuildMap)
		err := binary.Read(r, binary.LittleEndian, entry)
		// A clean EOF means all records were consumed; a short read
		// surfaces as ErrUnexpectedEOF and is reported as an error.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("read mapping: %w", err)
		}
		entries = append(entries, entry)
	}
	return NewHeader(&meta, entries)
}
// TotalBlocks returns how many fixed-size blocks are needed to cover size
// bytes, counting a trailing partial block as a full one (ceiling division).
func TotalBlocks(size, blockSize int64) int64 {
	rounded := size + blockSize - 1
	return rounded / blockSize
}
// BlockIdx converts a byte offset into the index of its containing block.
func BlockIdx(offset, blockSize int64) int64 {
	idx := offset / blockSize
	return idx
}
// BlockOffset converts a block index back into its starting byte offset.
func BlockOffset(idx, blockSize int64) int64 {
	start := idx * blockSize
	return start
}

View File

@ -7,14 +7,15 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"syscall" "syscall"
"github.com/google/uuid"
) )
const ( const (
SnapFileName = "snapfile" // Cloud Hypervisor snapshot files.
MemDiffName = "memfile" CHConfigFile = "config.json"
MemHeaderName = "memfile.header" CHMemRangesFile = "memory-ranges"
CHStateFile = "state.json"
// Rootfs files.
RootfsFileName = "rootfs.ext4" RootfsFileName = "rootfs.ext4"
RootfsCowName = "rootfs.cow" RootfsCowName = "rootfs.cow"
RootfsMetaName = "rootfs.meta" RootfsMetaName = "rootfs.meta"
@ -25,27 +26,6 @@ func DirPath(baseDir, name string) string {
return filepath.Join(baseDir, name) return filepath.Join(baseDir, name)
} }
// SnapPath returns the path to the VM state snapshot file.
func SnapPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), SnapFileName)
}
// MemDiffPath returns the path to the compact memory diff file (legacy single-generation).
func MemDiffPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemDiffName)
}
// MemDiffPathForBuild returns the path to a specific generation's diff file.
// Format: memfile.{buildID}
func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string {
return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String()))
}
// MemHeaderPath returns the path to the memory mapping header file.
func MemHeaderPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), MemHeaderName)
}
// RootfsPath returns the path to the rootfs image. // RootfsPath returns the path to the rootfs image.
func RootfsPath(baseDir, name string) string { func RootfsPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsFileName) return filepath.Join(DirPath(baseDir, name), RootfsFileName)
@ -61,10 +41,13 @@ func MetaPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsMetaName) return filepath.Join(DirPath(baseDir, name), RootfsMetaName)
} }
// RootfsMeta records which base template a CoW file was created against. // RootfsMeta records which base template a CoW file was created against
// and the VM resource config needed to restart the sampler on resume.
type RootfsMeta struct { type RootfsMeta struct {
BaseTemplate string `json:"base_template"` BaseTemplate string `json:"base_template"`
TemplateID string `json:"template_id,omitempty"` TemplateID string `json:"template_id,omitempty"`
VCPUs int `json:"vcpus,omitempty"`
MemoryMB int `json:"memory_mb,omitempty"`
} }
// WriteMeta writes rootfs metadata to the snapshot directory. // WriteMeta writes rootfs metadata to the snapshot directory.
@ -92,102 +75,6 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
return &meta, nil return &meta, nil
} }
// Exists reports whether a complete snapshot exists (all required files present).
// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}".
func Exists(baseDir, name string) bool {
	dir := DirPath(baseDir, name)
	present := func(file string) bool {
		_, err := os.Stat(filepath.Join(dir, file))
		return err == nil
	}

	// The VM state file and the memory mapping header are mandatory.
	if !present(SnapFileName) || !present(MemHeaderName) {
		return false
	}

	// At least one memory diff must exist: either the legacy "memfile" or
	// any generation-specific "memfile.{uuid}" (any glob match that is not
	// the header file itself).
	if !present(MemDiffName) {
		matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*"))
		found := false
		for _, match := range matches {
			if filepath.Base(match) != MemHeaderName {
				found = true
				break
			}
		}
		if !found {
			return false
		}
	}

	// Rootfs: either a full image (legacy/template), or a CoW overlay
	// paired with its metadata file (dm-snapshot).
	if present(RootfsFileName) {
		return true
	}
	return present(RootfsCowName) && present(RootfsMetaName)
}
// IsTemplate reports whether a template image directory exists (has rootfs.ext4).
func IsTemplate(baseDir, name string) bool {
	path := filepath.Join(DirPath(baseDir, name), RootfsFileName)
	if _, err := os.Stat(path); err != nil {
		return false
	}
	return true
}
// IsSnapshot reports whether a directory is a snapshot (has all snapshot files).
// Currently a thin alias for Exists, kept for call-site readability.
func IsSnapshot(baseDir, name string) bool {
	return Exists(baseDir, name)
}
// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta)
// as opposed to a legacy full rootfs (rootfs.ext4). Both files must be present.
func HasCow(baseDir, name string) bool {
	dir := DirPath(baseDir, name)
	for _, f := range []string{RootfsCowName, RootfsMetaName} {
		if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
			return false
		}
	}
	return true
}
// ListDiffFiles returns a map of build ID → file path for all memory diff
// files referenced by the given header. It handles both the legacy "memfile"
// name (single-generation) and generation-specific "memfile.{uuid}" names.
func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) {
	dir := DirPath(baseDir, name)
	paths := make(map[string]string)
	for _, mapping := range header.Mapping {
		if mapping.BuildID == uuid.Nil {
			continue // zero-fill, no file needed
		}
		id := mapping.BuildID.String()
		if _, seen := paths[id]; seen {
			continue
		}
		// Prefer the generation-specific file; fall back to the legacy name.
		candidates := []string{
			filepath.Join(dir, "memfile."+id),
			filepath.Join(dir, MemDiffName),
		}
		found := false
		for _, candidate := range candidates {
			if _, err := os.Stat(candidate); err == nil {
				paths[id] = candidate
				found = true
				break
			}
		}
		if !found {
			return nil, fmt.Errorf("diff file not found for build %s", id)
		}
	}
	return paths, nil
}
// EnsureDir creates the snapshot directory if it doesn't exist. // EnsureDir creates the snapshot directory if it doesn't exist.
func EnsureDir(baseDir, name string) error { func EnsureDir(baseDir, name string) error {
dir := DirPath(baseDir, name) dir := DirPath(baseDir, name)

View File

@ -1,214 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
package snapshot
import "github.com/google/uuid"
// CreateMapping converts a dirty-block bitset (represented as a []bool) into
// a sorted list of BuildMap entries. Consecutive dirty blocks are merged into
// a single entry. BuildStorageOffset tracks the sequential position in the
// compact diff file.
func CreateMapping(buildID uuid.UUID, dirty []bool, blockSize int64) []*BuildMap {
	var out []*BuildMap
	bs := uint64(blockSize)

	var storageOffset uint64
	runStart := int64(-1)
	runLen := int64(0)

	// flush emits the current run (if any) and advances the storage cursor.
	flush := func() {
		if runLen == 0 {
			return
		}
		out = append(out, &BuildMap{
			Offset:             uint64(runStart) * bs,
			Length:             uint64(runLen) * bs,
			BuildID:            buildID,
			BuildStorageOffset: storageOffset,
		})
		storageOffset += uint64(runLen) * bs
		runStart = -1
		runLen = 0
	}

	for i, set := range dirty {
		if !set {
			flush()
			continue
		}
		if runStart < 0 {
			runStart = int64(i)
		}
		runLen++
	}
	flush() // trailing run, if the bitset ends dirty
	return out
}
// MergeMappings overlays diffMapping on top of baseMapping. Where they overlap,
// diff takes priority. The result covers the entire address space.
//
// Both inputs must be sorted by Offset. The base mapping should cover the full size.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func MergeMappings(baseMapping, diffMapping []*BuildMap) []*BuildMap {
	if len(diffMapping) == 0 {
		return baseMapping
	}
	// Work on a copy of baseMapping to avoid mutating the original: base
	// entries may be trimmed in place as diff regions carve pieces out of them.
	baseCopy := make([]*BuildMap, len(baseMapping))
	for i, m := range baseMapping {
		cp := *m
		baseCopy[i] = &cp
	}
	var result []*BuildMap
	// Two-cursor sweep over both sorted lists.
	var bi, di int
	for bi < len(baseCopy) && di < len(diffMapping) {
		base := baseCopy[bi]
		diff := diffMapping[di]
		// Skip degenerate zero-length entries.
		if base.Length == 0 {
			bi++
			continue
		}
		if diff.Length == 0 {
			di++
			continue
		}
		// No overlap: base entirely before diff.
		if base.Offset+base.Length <= diff.Offset {
			result = append(result, base)
			bi++
			continue
		}
		// No overlap: diff entirely before base.
		if diff.Offset+diff.Length <= base.Offset {
			result = append(result, diff)
			di++
			continue
		}
		// Base fully inside diff — diff wins, skip base entirely.
		if base.Offset >= diff.Offset && base.Offset+base.Length <= diff.Offset+diff.Length {
			bi++
			continue
		}
		// Diff fully inside base — split base around diff.
		if diff.Offset >= base.Offset && diff.Offset+diff.Length <= base.Offset+base.Length {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}
			result = append(result, diff)
			di++
			// Keep the right remainder of base (shifted in both guest offset
			// and storage offset) in place for the next iteration.
			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift
			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}
		// Base starts after diff with overlap — emit diff, trim base's head.
		if base.Offset > diff.Offset {
			result = append(result, diff)
			di++
			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift
			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}
		// Diff starts after base with overlap — emit the left part of base.
		// The overlapped tail of base is fully covered by diff (the "diff
		// fully inside base" case above already handled the alternative),
		// so base can simply be dropped here; diff stays queued.
		if diff.Offset > base.Offset {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}
			bi++
			continue
		}
	}
	// Append remaining entries — at most one of these slices is non-empty,
	// since the loop exits only when a cursor reaches the end.
	result = append(result, baseCopy[bi:]...)
	result = append(result, diffMapping[di:]...)
	return result
}
// NormalizeMappings merges adjacent entries that share a BuildID into a
// single entry, keeping the first entry's Offset and BuildStorageOffset
// and summing the lengths. Returns nil for empty input.
func NormalizeMappings(mappings []*BuildMap) []*BuildMap {
	if len(mappings) == 0 {
		return nil
	}
	out := make([]*BuildMap, 0, len(mappings))
	acc := &BuildMap{
		Offset:             mappings[0].Offset,
		Length:             mappings[0].Length,
		BuildID:            mappings[0].BuildID,
		BuildStorageOffset: mappings[0].BuildStorageOffset,
	}
	for _, m := range mappings[1:] {
		if m.BuildID == acc.BuildID {
			// Same generation: extend the accumulated run.
			acc.Length += m.Length
			continue
		}
		out = append(out, acc)
		acc = &BuildMap{
			Offset:             m.Offset,
			Length:             m.Length,
			BuildID:            m.BuildID,
			BuildStorageOffset: m.BuildStorageOffset,
		}
	}
	return append(out, acc)
}

View File

@ -1,285 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
package snapshot
import (
"context"
"fmt"
"io"
"os"
"github.com/google/uuid"
)
const (
	// DefaultBlockSize is 4KB — standard page size for Firecracker.
	// All diff files and headers produced by this package use this
	// block granularity.
	DefaultBlockSize int64 = 4096
)
// ProcessMemfile reads a full memory file produced by Firecracker's
// PUT /snapshot/create, identifies non-zero blocks, and writes only those
// blocks to a compact diff file. Returns the Header describing the mapping.
//
// The output diff file contains non-zero blocks written sequentially.
// The header maps each block in the full address space to either:
//   - A position in the diff file (for non-zero blocks)
//   - uuid.Nil (for zero/empty blocks, served as zeros without I/O)
//
// buildID identifies this snapshot generation in the header chain.
func ProcessMemfile(memfilePath, diffPath, headerPath string, buildID uuid.UUID) (*Header, error) {
	src, err := os.Open(memfilePath)
	if err != nil {
		return nil, fmt.Errorf("open memfile: %w", err)
	}
	defer src.Close()
	info, err := src.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat memfile: %w", err)
	}
	memSize := info.Size()
	dst, err := os.Create(diffPath)
	if err != nil {
		return nil, fmt.Errorf("create diff file: %w", err)
	}
	defer dst.Close()
	totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
	// Per-block classification: dirty (non-zero, copied into the diff file)
	// vs empty (all zeros, zero-filled on restore without any I/O).
	dirty := make([]bool, totalBlocks)
	empty := make([]bool, totalBlocks)
	buf := make([]byte, DefaultBlockSize)
	for i := int64(0); i < totalBlocks; i++ {
		n, err := io.ReadFull(src, buf)
		// ErrUnexpectedEOF is tolerated: it only occurs on a short final block.
		if err != nil && err != io.ErrUnexpectedEOF {
			return nil, fmt.Errorf("read block %d: %w", i, err)
		}
		// Zero-pad the last block if it's short.
		if int64(n) < DefaultBlockSize {
			for j := n; j < int(DefaultBlockSize); j++ {
				buf[j] = 0
			}
		}
		if isZeroBlock(buf) {
			empty[i] = true
			continue
		}
		dirty[i] = true
		if _, err := dst.Write(buf); err != nil {
			return nil, fmt.Errorf("write diff block %d: %w", i, err)
		}
	}
	// Build header: dirty blocks map to this build's diff file, empty blocks
	// to uuid.Nil; together they cover the entire address space.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
	merged := MergeMappings(dirtyMappings, emptyMappings)
	normalized := NormalizeMappings(merged)
	metadata := NewMetadata(buildID, uint64(DefaultBlockSize), uint64(memSize))
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}
	// Write header to disk.
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}
// ProcessMemfileWithParent processes a memory file as a new generation on top
// of an existing parent header. The new diff file contains only blocks that
// differ from what the parent header maps. This is used for re-pause of a
// sandbox that was restored from a snapshot.
//
// Returns the new chained Header; the parent header is not modified.
func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHeader *Header, buildID uuid.UUID) (*Header, error) {
	src, err := os.Open(memfilePath)
	if err != nil {
		return nil, fmt.Errorf("open memfile: %w", err)
	}
	defer src.Close()
	info, err := src.Stat()
	if err != nil {
		return nil, fmt.Errorf("stat memfile: %w", err)
	}
	memSize := info.Size()
	dst, err := os.Create(diffPath)
	if err != nil {
		return nil, fmt.Errorf("create diff file: %w", err)
	}
	defer dst.Close()
	totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
	dirty := make([]bool, totalBlocks)
	buf := make([]byte, DefaultBlockSize)
	for i := int64(0); i < totalBlocks; i++ {
		n, err := io.ReadFull(src, buf)
		// ErrUnexpectedEOF is tolerated: short final block only.
		if err != nil && err != io.ErrUnexpectedEOF {
			return nil, fmt.Errorf("read block %d: %w", i, err)
		}
		// Zero-pad a short final block.
		if int64(n) < DefaultBlockSize {
			for j := n; j < int(DefaultBlockSize); j++ {
				buf[j] = 0
			}
		}
		if isZeroBlock(buf) {
			// For a diff memfile, zero blocks mean "not dirtied since resume" —
			// they should inherit the parent's mapping, not be zero-filled.
			// NOTE(review): a page the guest dirtied back to all-zeros is
			// indistinguishable from an untouched page here and would keep the
			// parent's stale data — confirm the memfile producer emits holes
			// only for genuinely clean pages.
			continue
		}
		dirty[i] = true
		if _, err := dst.Write(buf); err != nil {
			return nil, fmt.Errorf("write diff block %d: %w", i, err)
		}
	}
	// Only dirty blocks go into the diff overlay; MergeMappings preserves the
	// parent's mapping for everything else.
	dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
	merged := MergeMappings(parentHeader.Mapping, dirtyMappings)
	normalized := NormalizeMappings(merged)
	// Chain onto the parent: bumps the generation and records buildID.
	metadata := parentHeader.Metadata.NextGeneration(buildID)
	header, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create header: %w", err)
	}
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write header: %w", err)
	}
	return header, nil
}
// MergeDiffs consolidates multiple generation diff files into a single diff
// file and resets the generation counter to 0. This is a pure file-level
// operation — no Firecracker involvement.
//
// It reads each non-nil block from the appropriate diff file (as mapped by
// the header), writes them all sequentially into a single new diff file,
// and produces a fresh header pointing only at that file.
//
// diffFiles maps build ID (string) → open file path for each generation's diff.
func MergeDiffs(header *Header, diffFiles map[string]string, mergedDiffPath, headerPath string) (*Header, error) {
	blockSize := int64(header.Metadata.BlockSize)
	// The merged output is identified by a brand-new build ID.
	mergedBuildID := uuid.New()
	// Open all source diff files.
	sources := make(map[string]*os.File, len(diffFiles))
	for id, path := range diffFiles {
		f, err := os.Open(path)
		if err != nil {
			// Close already opened files.
			for _, sf := range sources {
				sf.Close()
			}
			return nil, fmt.Errorf("open diff file for build %s: %w", id, err)
		}
		sources[id] = f
	}
	defer func() {
		for _, f := range sources {
			f.Close()
		}
	}()
	dst, err := os.Create(mergedDiffPath)
	if err != nil {
		return nil, fmt.Errorf("create merged diff file: %w", err)
	}
	defer dst.Close()
	totalBlocks := TotalBlocks(int64(header.Metadata.Size), blockSize)
	dirty := make([]bool, totalBlocks)
	empty := make([]bool, totalBlocks)
	buf := make([]byte, blockSize)
	// Walk every block of the address space, resolving each through the
	// header to the generation diff that owns it.
	for i := int64(0); i < totalBlocks; i++ {
		offset := i * blockSize
		mappedOffset, _, buildID, err := header.GetShiftedMapping(context.Background(), offset)
		if err != nil {
			return nil, fmt.Errorf("lookup block %d: %w", i, err)
		}
		// uuid.Nil blocks stay zero-filled; nothing to copy.
		if *buildID == uuid.Nil {
			empty[i] = true
			continue
		}
		src, ok := sources[buildID.String()]
		if !ok {
			return nil, fmt.Errorf("no diff file for build %s (block %d)", buildID, i)
		}
		if _, err := src.ReadAt(buf, mappedOffset); err != nil {
			return nil, fmt.Errorf("read block %d from build %s: %w", i, buildID, err)
		}
		dirty[i] = true
		if _, err := dst.Write(buf); err != nil {
			return nil, fmt.Errorf("write merged block %d: %w", i, err)
		}
	}
	// Build fresh header with generation 0.
	dirtyMappings := CreateMapping(mergedBuildID, dirty, blockSize)
	emptyMappings := CreateMapping(uuid.Nil, empty, blockSize)
	merged := MergeMappings(dirtyMappings, emptyMappings)
	normalized := NormalizeMappings(merged)
	metadata := NewMetadata(mergedBuildID, uint64(blockSize), header.Metadata.Size)
	newHeader, err := NewHeader(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("create merged header: %w", err)
	}
	headerData, err := Serialize(metadata, normalized)
	if err != nil {
		return nil, fmt.Errorf("serialize merged header: %w", err)
	}
	if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
		return nil, fmt.Errorf("write merged header: %w", err)
	}
	return newHeader, nil
}
// isZeroBlock reports whether every byte of block is zero.
func isZeroBlock(block []byte) bool {
	// Fast path: fold 8 bytes at a time with OR and test once.
	i := 0
	for ; i+8 <= len(block); i += 8 {
		if block[i]|block[i+1]|block[i+2]|block[i+3]|
			block[i+4]|block[i+5]|block[i+6]|block[i+7] != 0 {
			return false
		}
	}
	// Remaining tail bytes (fewer than 8).
	for ; i < len(block); i++ {
		if block[i] != 0 {
			return false
		}
	}
	return true
}

View File

@ -1,92 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
// Package uffd implements a userfaultfd-based memory server for Firecracker
// snapshot restore. When a VM is restored from a snapshot, instead of loading
// the entire memory file upfront, the UFFD handler intercepts page faults
// and serves memory pages on demand from the snapshot's compact diff file.
package uffd
/*
#include <sys/syscall.h>
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
struct uffd_pagefault {
__u64 flags;
__u64 address;
__u32 ptid;
};
*/
import "C"
import (
"fmt"
"syscall"
"unsafe"
)
const (
	// Event types delivered in uffd_msg.event (mirrors <linux/userfaultfd.h>).
	UFFD_EVENT_PAGEFAULT = C.UFFD_EVENT_PAGEFAULT
	UFFD_EVENT_FORK      = C.UFFD_EVENT_FORK
	UFFD_EVENT_REMAP     = C.UFFD_EVENT_REMAP
	UFFD_EVENT_REMOVE    = C.UFFD_EVENT_REMOVE
	UFFD_EVENT_UNMAP     = C.UFFD_EVENT_UNMAP

	// Flag set in a pagefault event when the fault was a write access.
	UFFD_PAGEFAULT_FLAG_WRITE = C.UFFD_PAGEFAULT_FLAG_WRITE

	// ioctl request number and mode flag used to install pages into
	// guest memory.
	UFFDIO_COPY         = C.UFFDIO_COPY
	UFFDIO_COPY_MODE_WP = C.UFFDIO_COPY_MODE_WP
)

// Go aliases for the C structs decoded from the userfaultfd message stream.
type (
	uffdMsg       = C.struct_uffd_msg
	uffdPagefault = C.struct_uffd_pagefault
	uffdioCopy    = C.struct_uffdio_copy
)

// fd wraps a userfaultfd file descriptor received from Firecracker.
type fd uintptr
// copy installs a page into guest memory at the given address using UFFDIO_COPY.
// mode controls write-protection: use UFFDIO_COPY_MODE_WP to preserve WP bit.
//
// The destination is rounded down to the page boundary (UFFDIO_COPY requires
// page alignment); data must hold at least pagesize bytes.
func (f fd) copy(addr, pagesize uintptr, data []byte, mode C.ulonglong) error {
	alignedAddr := addr &^ (pagesize - 1)
	cpy := uffdioCopy{
		src:  C.ulonglong(uintptr(unsafe.Pointer(&data[0]))),
		dst:  C.ulonglong(alignedAddr),
		len:  C.ulonglong(pagesize),
		mode: mode,
		copy: 0, // out-param: kernel writes the number of bytes copied
	}
	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy)))
	if errno != 0 {
		return errno
	}
	// A short copy is reported via the copy field even when the ioctl
	// itself returned success.
	if cpy.copy != C.longlong(pagesize) {
		return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize)
	}
	return nil
}
// close closes the userfaultfd file descriptor.
func (f fd) close() error {
	return syscall.Close(int(f))
}
// getMsgEvent extracts the event type from a uffd_msg.
func getMsgEvent(msg *uffdMsg) C.uchar {
	return msg.event
}

// getMsgArg extracts the raw arg union from a uffd_msg. Callers reinterpret
// the bytes as the payload struct matching the event type (e.g. a
// uffd_pagefault for UFFD_EVENT_PAGEFAULT).
func getMsgArg(msg *uffdMsg) [24]byte {
	return msg.arg
}

// getPagefaultAddress extracts the faulting address from a uffd_pagefault.
func getPagefaultAddress(pf *uffdPagefault) uintptr {
	return uintptr(pf.address)
}

View File

@ -1,41 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
//
// Modifications by Omukk (Wrenn Sandbox): merged Region and Mapping into
// single file, inlined shiftedOffset helper.
package uffd
import "fmt"
// Region describes one guest-memory region mapped into host virtual address
// space. Firecracker sends these as JSON when connecting to the UFFD socket;
// the JSON field names follow Firecracker's UFFD protocol.
type Region struct {
	BaseHostVirtAddr uintptr `json:"base_host_virt_addr"`
	Size             uintptr `json:"size"`
	Offset           uintptr `json:"offset"`
	PageSize         uintptr `json:"page_size_kib"` // Actually in bytes despite the name.
}

// Mapping translates between host virtual addresses and logical memory offsets.
type Mapping struct {
	Regions []Region
}

// NewMapping wraps the given regions in a Mapping.
func NewMapping(regions []Region) *Mapping {
	return &Mapping{Regions: regions}
}

// GetOffset converts a host virtual address to a logical memory file offset
// and returns the owning region's page size. Called on every UFFD page fault.
func (m *Mapping) GetOffset(hostVirtAddr uintptr) (int64, uintptr, error) {
	for _, region := range m.Regions {
		end := region.BaseHostVirtAddr + region.Size
		if hostVirtAddr < region.BaseHostVirtAddr || hostVirtAddr >= end {
			continue
		}
		delta := hostVirtAddr - region.BaseHostVirtAddr
		return int64(delta) + int64(region.Offset), region.PageSize, nil
	}
	return 0, 0, fmt.Errorf("address %#x not found in any memory region", hostVirtAddr)
}

View File

@ -1,451 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
//
// Modifications by Omukk (Wrenn Sandbox): replaced errgroup with WaitGroup
// + semaphore, replaced fdexit abstraction with pipe, integrated with
// snapshot.Header-based DiffFileSource instead of block.ReadonlyDevice,
// fixed EAGAIN handling in poll loop.
package uffd
/*
#include <linux/userfaultfd.h>
*/
import "C"
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net"
"os"
"sync"
"syscall"
"unsafe"
"golang.org/x/sys/unix"
"git.omukk.dev/wrenn/wrenn/internal/snapshot"
)
const (
	// fdSize is the byte size of the single file descriptor received
	// via SCM_RIGHTS from Firecracker.
	fdSize = 4
	// regionMappingsSize bounds the JSON region description that
	// Firecracker sends alongside the fd.
	regionMappingsSize = 1024
	// maxConcurrentFaults caps the number of in-flight page-fault
	// handler goroutines.
	maxConcurrentFaults = 4096
)
// MemorySource provides page data for the UFFD handler.
// Given a logical memory offset and a size, it returns the page data.
//
// NOTE(review): faults are served from multiple goroutines concurrently
// (see serve), so implementations should be safe for concurrent ReadPage
// calls — confirm for any new implementation.
type MemorySource interface {
	ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error)
}
// Server manages the UFFD Unix socket lifecycle and page fault handling
// for a single Firecracker snapshot restore.
type Server struct {
	socketPath string
	source     MemorySource
	lis        *net.UnixListener

	// readyCh is closed (via readyOnce) once Firecracker has connected and
	// sent the uffd fd — or when the handler exits early, so waiters never
	// block forever.
	readyCh   chan struct{}
	readyOnce sync.Once

	// doneCh is closed when the handler goroutine exits; doneErr carries
	// its final error.
	doneCh  chan struct{}
	doneErr error

	// exitPipe signals the poll loop to stop.
	exitR *os.File
	exitW *os.File

	// Set by handle() after Firecracker connects; read by Prefetch()
	// after waiting on readyCh (which establishes happens-before).
	uffdFd  fd
	mapping *Mapping

	// Prefetch lifecycle: cancel stops the goroutine, prefetchDone is
	// closed when it exits. Stop() drains prefetchDone before returning
	// so the caller can safely close diff file handles.
	prefetchCancel context.CancelFunc
	prefetchDone   chan struct{}
}
// NewServer constructs a UFFD server that will listen on socketPath and
// serve guest memory pages from source. Call Start to begin listening.
func NewServer(socketPath string, source MemorySource) *Server {
	srv := &Server{
		socketPath: socketPath,
		source:     source,
	}
	srv.readyCh = make(chan struct{})
	srv.doneCh = make(chan struct{})
	return srv
}
// Start begins listening on the Unix socket. Firecracker will connect to this
// socket after loadSnapshot is called with the UFFD backend.
// Start returns immediately; the server runs in a background goroutine.
func (s *Server) Start(ctx context.Context) error {
	lis, err := net.ListenUnix("unix", &net.UnixAddr{Name: s.socketPath, Net: "unix"})
	if err != nil {
		return fmt.Errorf("listen on uffd socket: %w", err)
	}
	s.lis = lis
	// World-writable so the Firecracker process can connect regardless of
	// which user it runs as.
	if err := os.Chmod(s.socketPath, 0o777); err != nil {
		lis.Close()
		return fmt.Errorf("chmod uffd socket: %w", err)
	}
	// Create exit signal pipe; Stop() writes a byte to wake the poll loop.
	r, w, err := os.Pipe()
	if err != nil {
		lis.Close()
		return fmt.Errorf("create exit pipe: %w", err)
	}
	s.exitR = r
	s.exitW = w
	go func() {
		defer close(s.doneCh)
		s.doneErr = s.handle(ctx)
		s.lis.Close()
		s.exitR.Close()
		s.exitW.Close()
		// Unblock Ready() waiters even if handle() failed before
		// Firecracker ever connected.
		s.readyOnce.Do(func() { close(s.readyCh) })
	}()
	return nil
}
// Ready returns a channel that is closed when the UFFD handler is ready
// (after Firecracker has connected and sent the uffd fd). The channel is
// also closed when the handler exits early, so waiters never block forever.
func (s *Server) Ready() <-chan struct{} {
	return s.readyCh
}
// Stop signals the UFFD poll loop to exit and waits for it to finish.
// Also cancels and waits for any running prefetch goroutine, so after Stop
// returns the caller can safely close diff file handles.
func (s *Server) Stop() error {
	if s.prefetchCancel != nil {
		s.prefetchCancel()
	}
	// Write a byte to the exit pipe to wake the poll loop.
	_, _ = s.exitW.Write([]byte{0})
	<-s.doneCh
	if s.prefetchDone != nil {
		<-s.prefetchDone
	}
	return s.doneErr
}
// Wait blocks until the server's handler goroutine exits and returns its
// final error.
func (s *Server) Wait() error {
	<-s.doneCh
	return s.doneErr
}
// handle accepts the Firecracker connection, receives the UFFD fd via
// SCM_RIGHTS, and runs the page fault poll loop.
//
// Firecracker's single message carries two things at once: the guest memory
// region layout as JSON in the data bytes, and the userfaultfd as ancillary
// SCM_RIGHTS data.
func (s *Server) handle(ctx context.Context) error {
	conn, err := s.lis.Accept()
	if err != nil {
		return fmt.Errorf("accept uffd connection: %w", err)
	}
	unixConn := conn.(*net.UnixConn)
	defer unixConn.Close()
	// Read the memory region mappings (JSON) and the UFFD fd (SCM_RIGHTS).
	regionBuf := make([]byte, regionMappingsSize)
	uffdBuf := make([]byte, syscall.CmsgSpace(fdSize))
	nRegion, nFd, _, _, err := unixConn.ReadMsgUnix(regionBuf, uffdBuf)
	if err != nil {
		return fmt.Errorf("read uffd message: %w", err)
	}
	var regions []Region
	if err := json.Unmarshal(regionBuf[:nRegion], &regions); err != nil {
		return fmt.Errorf("parse memory regions: %w", err)
	}
	controlMsgs, err := syscall.ParseSocketControlMessage(uffdBuf[:nFd])
	if err != nil {
		return fmt.Errorf("parse control messages: %w", err)
	}
	if len(controlMsgs) != 1 {
		return fmt.Errorf("expected 1 control message, got %d", len(controlMsgs))
	}
	fds, err := syscall.ParseUnixRights(&controlMsgs[0])
	if err != nil {
		return fmt.Errorf("parse unix rights: %w", err)
	}
	if len(fds) != 1 {
		return fmt.Errorf("expected 1 fd, got %d", len(fds))
	}
	uffdFd := fd(fds[0])
	defer uffdFd.close()
	mapping := NewMapping(regions)
	// Store for use by Prefetch(). Publishing before readyCh is closed
	// below gives readers a happens-before edge.
	s.uffdFd = uffdFd
	s.mapping = mapping
	slog.Info("uffd handler connected",
		"regions", len(regions),
		"fd", int(uffdFd),
	)
	// Signal readiness.
	s.readyOnce.Do(func() { close(s.readyCh) })
	// Run the poll loop.
	return s.serve(ctx, uffdFd, mapping)
}
// serve is the main poll loop. It polls the UFFD fd for page fault events
// and the exit pipe for shutdown signals. Each page fault is dispatched to
// its own goroutine, bounded by a semaphore of maxConcurrentFaults.
func (s *Server) serve(ctx context.Context, uffdFd fd, mapping *Mapping) error {
	pollFds := []unix.PollFd{
		{Fd: int32(uffdFd), Events: unix.POLLIN},
		{Fd: int32(s.exitR.Fd()), Events: unix.POLLIN},
	}
	var wg sync.WaitGroup
	sem := make(chan struct{}, maxConcurrentFaults)
	// Always wait for in-flight goroutines before returning, so the caller
	// can safely close the uffd fd after serve returns.
	defer wg.Wait()
	for {
		if _, err := unix.Poll(pollFds, -1); err != nil {
			if err == unix.EINTR || err == unix.EAGAIN {
				continue
			}
			return fmt.Errorf("poll: %w", err)
		}
		// Check exit signal.
		if pollFds[1].Revents&unix.POLLIN != 0 {
			return nil
		}
		if pollFds[0].Revents&unix.POLLIN == 0 {
			continue
		}
		// Read the uffd_msg. The fd is O_NONBLOCK (set by Firecracker),
		// so EAGAIN is expected — just go back to poll.
		buf := make([]byte, unsafe.Sizeof(uffdMsg{}))
		n, err := readUffdMsg(uffdFd, buf)
		if err == syscall.EAGAIN {
			continue
		}
		if err != nil {
			return fmt.Errorf("read uffd msg: %w", err)
		}
		if n == 0 {
			continue
		}
		// Reinterpret the raw bytes as a uffd_msg.
		msg := *(*uffdMsg)(unsafe.Pointer(&buf[0]))
		event := getMsgEvent(&msg)
		switch event {
		case UFFD_EVENT_PAGEFAULT:
			// Handled below.
		case UFFD_EVENT_REMOVE, UFFD_EVENT_UNMAP, UFFD_EVENT_REMAP, UFFD_EVENT_FORK:
			// Non-fatal lifecycle events from the guest kernel (e.g. balloon
			// deflation, mmap/munmap). No action needed — continue polling.
			continue
		default:
			return fmt.Errorf("unexpected uffd event type: %d", event)
		}
		arg := getMsgArg(&msg)
		pf := *(*uffdPagefault)(unsafe.Pointer(&arg[0]))
		addr := getPagefaultAddress(&pf)
		offset, pagesize, err := mapping.GetOffset(addr)
		if err != nil {
			return fmt.Errorf("resolve address %#x: %w", addr, err)
		}
		// Acquire a semaphore slot before spawning so at most
		// maxConcurrentFaults handlers run at once.
		sem <- struct{}{}
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer func() { <-sem }()
			if err := s.faultPage(ctx, uffdFd, addr, offset, pagesize); err != nil {
				slog.Error("uffd fault page error",
					"addr", fmt.Sprintf("%#x", addr),
					"offset", offset,
					"error", err,
				)
			}
		}()
	}
}
// readUffdMsg reads a single uffd_msg from the userfaultfd, retrying on
// EINTR. Returns (n, EAGAIN) when the non-blocking fd has nothing available.
func readUffdMsg(uffdFd fd, buf []byte) (int, error) {
	for {
		n, err := syscall.Read(int(uffdFd), buf)
		if err != syscall.EINTR {
			return n, err
		}
		// Interrupted by a signal: retry the read.
	}
}
// faultPage fetches one page from the memory source and installs it into
// guest memory via UFFDIO_COPY.
func (s *Server) faultPage(ctx context.Context, uffdFd fd, addr uintptr, offset int64, pagesize uintptr) error {
	page, err := s.source.ReadPage(ctx, offset, int64(pagesize))
	if err != nil {
		return fmt.Errorf("read page at offset %d: %w", offset, err)
	}
	// Mode 0: no write-protect. Standard Firecracker does not register
	// UFFD ranges with WP support, so UFFDIO_COPY_MODE_WP would fail.
	copyErr := uffdFd.copy(addr, pagesize, page, 0)
	if copyErr == nil {
		return nil
	}
	if errors.Is(copyErr, unix.EEXIST) {
		// Page already mapped (race with prefetch or concurrent fault).
		return nil
	}
	return fmt.Errorf("uffdio_copy: %w", copyErr)
}
// Prefetch proactively loads all guest memory pages in the background.
// It iterates over every page in every UFFD region and copies it from the
// diff file into guest memory via UFFDIO_COPY. Pages already loaded by
// on-demand faults return nil from faultPage (EEXIST handled internally).
// This eliminates the per-request latency caused by lazy page faulting
// after snapshot restore.
//
// The goroutine blocks on readyCh before reading the uffd fd and mapping
// fields (establishes happens-before with handle()). It uses an internal
// context independent of the caller's RPC context so it survives after the
// create/resume RPC returns. Stop() cancels and joins the goroutine.
func (s *Server) Prefetch() {
	ctx, cancel := context.WithCancel(context.Background())
	s.prefetchCancel = cancel
	s.prefetchDone = make(chan struct{})
	go func() {
		defer close(s.prefetchDone)
		// Wait for Firecracker to connect and send the uffd fd.
		select {
		case <-s.readyCh:
		case <-ctx.Done():
			return
		}
		uffdFd := s.uffdFd
		mapping := s.mapping
		// mapping stays nil if the handler exited before Firecracker connected.
		if mapping == nil {
			return
		}
		var total, errored int
		for _, region := range mapping.Regions {
			pageSize := region.PageSize
			// Defensive: a zero page size would make the loop below spin forever.
			if pageSize == 0 {
				continue
			}
			for off := uintptr(0); off < region.Size; off += pageSize {
				// Check cancellation once per page so Stop() joins promptly.
				if ctx.Err() != nil {
					slog.Debug("uffd prefetch cancelled",
						"pages", total, "errors", errored)
					return
				}
				addr := region.BaseHostVirtAddr + off
				memOffset := int64(off) + int64(region.Offset)
				if err := s.faultPage(ctx, uffdFd, addr, memOffset, pageSize); err != nil {
					errored++
				} else {
					total++
				}
			}
		}
		slog.Info("uffd prefetch complete",
			"pages", total, "errors", errored)
	}()
}
// DiffFileSource serves pages from a snapshot's compact diff file using
// the header's block mapping to resolve offsets. It implements MemorySource.
type DiffFileSource struct {
	header *snapshot.Header
	// diffs maps build ID → open file handle for each generation's diff file.
	diffs map[string]*os.File
}
// NewDiffFileSource creates a memory source backed by snapshot diff files.
// diffPaths maps build ID string to the file path of each generation's diff file.
func NewDiffFileSource(header *snapshot.Header, diffPaths map[string]string) (*DiffFileSource, error) {
	handles := make(map[string]*os.File, len(diffPaths))
	closeAll := func() {
		for _, f := range handles {
			f.Close()
		}
	}
	for id, path := range diffPaths {
		f, err := os.Open(path)
		if err != nil {
			// Don't leak the handles we already opened.
			closeAll()
			return nil, fmt.Errorf("open diff file %s: %w", path, err)
		}
		handles[id] = f
	}
	return &DiffFileSource{header: header, diffs: handles}, nil
}
// ReadPage resolves a memory offset through the header mapping and reads
// the corresponding page from the correct generation's diff file. A zero
// build ID (uuid.Nil) marks an empty page, served as zeros with no I/O.
func (s *DiffFileSource) ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error) {
	mappedOffset, _, buildID, err := s.header.GetShiftedMapping(ctx, offset)
	if err != nil {
		return nil, fmt.Errorf("resolve offset %d: %w", offset, err)
	}
	// Zero build ID → zero-filled page, no file access needed.
	var zeroID [16]byte
	if *buildID == zeroID {
		return make([]byte, size), nil
	}
	src, ok := s.diffs[buildID.String()]
	if !ok {
		return nil, fmt.Errorf("no diff file for build %s", buildID)
	}
	page := make([]byte, size)
	// A partial read at EOF is acceptable as long as size bytes arrived.
	if n, err := src.ReadAt(page, mappedOffset); err != nil && int64(n) < size {
		return nil, fmt.Errorf("read diff at offset %d: %w", mappedOffset, err)
	}
	return page, nil
}
// Close closes every open diff file handle, joining any errors.
func (s *DiffFileSource) Close() error {
	var closeErrs []error
	for _, handle := range s.diffs {
		if err := handle.Close(); err != nil {
			closeErrs = append(closeErrs, err)
		}
	}
	return errors.Join(closeErrs...)
}

213
internal/vm/ch.go Normal file
View File

@ -0,0 +1,213 @@
package vm
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"net/http"
	"path/filepath"
)
// chClient talks to the Cloud Hypervisor HTTP API over a Unix socket.
type chClient struct {
	http       *http.Client
	socketPath string
}

// newCHClient builds a client whose transport dials the given Unix socket
// regardless of the request URL's host.
func newCHClient(socketPath string) *chClient {
	transport := &http.Transport{
		DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
			dialer := net.Dialer{}
			return dialer.DialContext(ctx, "unix", socketPath)
		},
	}
	return &chClient{
		socketPath: socketPath,
		http:       &http.Client{Transport: transport},
	}
}
// do issues one request to the CH API socket. body (if non-nil) is
// JSON-encoded; any 2xx status is treated as success and other statuses
// become an error that includes the response body.
func (c *chClient) do(ctx context.Context, method, path string, body any) error {
	var bodyReader io.Reader
	if body != nil {
		data, err := json.Marshal(body)
		if err != nil {
			return fmt.Errorf("marshal request body: %w", err)
		}
		bodyReader = bytes.NewReader(data)
	}
	// The host is a placeholder: the transport always dials the Unix socket.
	req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}
	resp, err := c.http.Do(req)
	if err != nil {
		return fmt.Errorf("%s %s: %w", method, path, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
	}
	// Drain the body on success so the transport can reuse the Unix-socket
	// connection (keep-alive); an unread body forces a new connection per call.
	_, _ = io.Copy(io.Discard, resp.Body)
	return nil
}
// --- CH API payload types (the subset of Cloud Hypervisor's VmConfig
// that this client sends) ---

// chPayload selects what the VMM boots: a direct kernel with a command
// line, and optionally firmware.
type chPayload struct {
	Firmware string `json:"firmware,omitempty"`
	Kernel   string `json:"kernel"`
	Cmdline  string `json:"cmdline"`
}

// chCPUs configures boot-time and hot-pluggable vCPU counts.
type chCPUs struct {
	BootVCPUs int `json:"boot_vcpus"`
	MaxVCPUs  int `json:"max_vcpus"`
}

// chMemory configures guest RAM. Size is in bytes.
type chMemory struct {
	Size          uint64 `json:"size"`
	Shared        bool   `json:"shared,omitempty"`
	HotplugSize   uint64 `json:"hotplug_size,omitempty"`
	HotplugMethod string `json:"hotplug_method,omitempty"`
}

// chDisk configures one block device backed by a host file.
type chDisk struct {
	Path      string `json:"path"`
	Readonly  bool   `json:"readonly,omitempty"`
	ImageType string `json:"image_type,omitempty"`
}

// chNet configures one virtio-net device attached to a host TAP interface.
type chNet struct {
	Tap    string `json:"tap"`
	MAC    string `json:"mac"`
	NumQs  int    `json:"num_queues,omitempty"` // virtio queue count
	QueueS int    `json:"queue_size,omitempty"` // per-queue descriptor count
}

// chBalloon configures the virtio-balloon device.
type chBalloon struct {
	// Size is the initial balloon target — presumably bytes; confirm
	// against the CH API reference before relying on units.
	Size         int64 `json:"size"`
	DeflateOnOOM bool  `json:"deflate_on_oom"`
	FreePageRep  bool  `json:"free_page_reporting,omitempty"`
}

// chConsole selects a console mode (e.g. "Tty", "Off").
type chConsole struct {
	Mode string `json:"mode"`
}

// chCreatePayload is the full body of PUT /api/v1/vm.create.
type chCreatePayload struct {
	Payload chPayload  `json:"payload"`
	CPUs    chCPUs     `json:"cpus"`
	Memory  chMemory   `json:"memory"`
	Disks   []chDisk   `json:"disks"`
	Net     []chNet    `json:"net"`
	Balloon *chBalloon `json:"balloon,omitempty"`
	Serial  chConsole  `json:"serial"`
	Console chConsole  `json:"console"`
}
// createVM pushes the complete VM definition — boot payload, CPU topology,
// memory, root disk, network, balloon, and console wiring — to the VMM in a
// single vm.create call. The VM does not start running until bootVM is called.
func (c *chClient) createVM(ctx context.Context, cfg *VMConfig) error {
	// CH expects memory sizes in bytes; the config carries MiB.
	memoryBytes := uint64(cfg.MemoryMB) * 1024 * 1024

	// The root disk sits behind a stable symlink inside the sandbox tmpfs.
	rootDisk := chDisk{
		Path:      cfg.SandboxDir + "/rootfs.ext4",
		ImageType: "Raw",
	}

	nic := chNet{
		Tap: cfg.TapDevice,
		MAC: cfg.TapMAC,
	}

	// Balloon starts deflated (size 0); deflate_on_oom lets the guest reclaim
	// balloon pages under memory pressure.
	balloon := &chBalloon{
		Size:         0,
		DeflateOnOOM: true,
		FreePageRep:  true,
	}

	request := chCreatePayload{
		Payload: chPayload{
			Kernel:  cfg.KernelPath,
			Cmdline: cfg.kernelArgs(),
		},
		CPUs:    chCPUs{BootVCPUs: cfg.VCPUs, MaxVCPUs: cfg.VCPUs},
		Memory:  chMemory{Size: memoryBytes, Shared: true},
		Disks:   []chDisk{rootDisk},
		Net:     []chNet{nic},
		Balloon: balloon,
		Serial:  chConsole{Mode: "Tty"},
		Console: chConsole{Mode: "Off"},
	}
	return c.do(ctx, http.MethodPut, "/api/v1/vm.create", request)
}
// bootVM starts a VM previously defined with createVM.
func (c *chClient) bootVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.boot", nil)
}

// pauseVM pauses the microVM.
func (c *chClient) pauseVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.pause", nil)
}

// resumeVM resumes a paused microVM.
func (c *chClient) resumeVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.resume", nil)
}
// snapshotVM creates a VM snapshot to the given directory.
// destURL must be a URL the VMM can write to; callers pass "file://" + dir.
// The VM should already be paused (see the manager's Snapshot).
func (c *chClient) snapshotVM(ctx context.Context, destURL string) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.snapshot", map[string]string{
		"destination_url": destURL,
	})
}

// restoreVM restores a VM from a snapshot via the API. Uses OnDemand memory
// restore mode for UFFD-based lazy page loading — only pages the guest
// actually touches are faulted in from disk.
//
// NOTE(review): upstream Cloud Hypervisor's documented RestoreConfig takes
// "source_url" and "prefault"; "memory_restore_mode" and "resume" are not in
// the upstream API — confirm the CH build in use accepts these fields, since
// unknown fields may be rejected or silently ignored (leaving the VM paused).
func (c *chClient) restoreVM(ctx context.Context, sourceURL string) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.restore", map[string]any{
		"source_url":          sourceURL,
		"memory_restore_mode": "OnDemand",
		"resume":              true,
	})
}
// shutdownVMM cleanly shuts down the Cloud Hypervisor VMM process.
// After a successful call the API socket is no longer usable.
func (c *chClient) shutdownVMM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vmm.shutdown", nil)
}

// resizeBalloon adjusts the balloon target at runtime via vm.resize.
// sizeBytes is memory to take FROM the guest (0 = give all back).
func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.resize", map[string]int64{
		"desired_balloon": sizeBytes,
	})
}

// ping checks if the VMM is alive and ready to accept commands.
func (c *chClient) ping(ctx context.Context) error {
	return c.do(ctx, http.MethodGet, "/api/v1/vmm.ping", nil)
}

View File

@ -2,13 +2,12 @@ package vm
import "fmt" import "fmt"
// VMConfig holds the configuration for creating a Firecracker microVM. // VMConfig holds the configuration for creating a Cloud Hypervisor microVM.
type VMConfig struct { type VMConfig struct {
// SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4"). // SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4").
SandboxID string SandboxID string
// TemplateID is the template UUID string used to populate MMDS metadata // TemplateID is the template UUID string, passed to envd via PostInit.
// so that envd can read WRENN_TEMPLATE_ID from inside the guest.
TemplateID string TemplateID string
// KernelPath is the path to the uncompressed Linux kernel (vmlinux). // KernelPath is the path to the uncompressed Linux kernel (vmlinux).
@ -25,12 +24,12 @@ type VMConfig struct {
MemoryMB int MemoryMB int
// NetworkNamespace is the name of the network namespace to launch // NetworkNamespace is the name of the network namespace to launch
// Firecracker inside (e.g., "ns-1"). The namespace must already exist // Cloud Hypervisor inside (e.g., "ns-1"). The namespace must already exist
// with a TAP device configured. // with a TAP device configured.
NetworkNamespace string NetworkNamespace string
// TapDevice is the name of the TAP device inside the network namespace // TapDevice is the name of the TAP device inside the network namespace
// that Firecracker will attach to (e.g., "tap0"). // that Cloud Hypervisor will attach to (e.g., "tap0").
TapDevice string TapDevice string
// TapMAC is the MAC address for the TAP device. // TapMAC is the MAC address for the TAP device.
@ -45,19 +44,23 @@ type VMConfig struct {
// NetMask is the subnet mask for the guest network (e.g., "255.255.255.252"). // NetMask is the subnet mask for the guest network (e.g., "255.255.255.252").
NetMask string NetMask string
// FirecrackerBin is the path to the firecracker binary. // VMMBin is the path to the cloud-hypervisor binary.
FirecrackerBin string VMMBin string
// SocketPath is the path for the Firecracker API Unix socket. // SocketPath is the path for the Cloud Hypervisor API Unix socket.
SocketPath string SocketPath string
// SandboxDir is the tmpfs mount point for per-sandbox files inside the // SandboxDir is the tmpfs mount point for per-sandbox files inside the
// mount namespace (e.g., "/fc-vm"). // mount namespace (e.g., "/ch-vm").
SandboxDir string SandboxDir string
// InitPath is the path to the init process inside the guest. // InitPath is the path to the init process inside the guest.
// Defaults to "/sbin/init" if empty. // Defaults to "/sbin/init" if empty.
InitPath string InitPath string
// SnapshotDir is the path to the snapshot directory for restore.
// Only set when restoring from a snapshot.
SnapshotDir string
} }
func (c *VMConfig) applyDefaults() { func (c *VMConfig) applyDefaults() {
@ -67,14 +70,14 @@ func (c *VMConfig) applyDefaults() {
if c.MemoryMB == 0 { if c.MemoryMB == 0 {
c.MemoryMB = 512 c.MemoryMB = 512
} }
if c.FirecrackerBin == "" { if c.VMMBin == "" {
c.FirecrackerBin = "/usr/local/bin/firecracker" c.VMMBin = "/usr/local/bin/cloud-hypervisor"
} }
if c.SocketPath == "" { if c.SocketPath == "" {
c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID) c.SocketPath = fmt.Sprintf("/tmp/ch-%s.sock", c.SandboxID)
} }
if c.SandboxDir == "" { if c.SandboxDir == "" {
c.SandboxDir = "/tmp/fc-vm" c.SandboxDir = "/tmp/ch-vm"
} }
if c.TapDevice == "" { if c.TapDevice == "" {
c.TapDevice = "tap0" c.TapDevice = "tap0"
@ -95,7 +98,7 @@ func (c *VMConfig) kernelArgs() string {
) )
return fmt.Sprintf( return fmt.Sprintf(
"console=ttyS0 reboot=k panic=1 pci=off quiet loglevel=1 clocksource=kvm-clock init=%s %s", "console=ttyS0 root=/dev/vda rw reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=%s %s",
c.InitPath, ipArg, c.InitPath, ipArg,
) )
} }

View File

@ -1,202 +0,0 @@
package vm
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
)
// fcClient talks to the Firecracker HTTP API over a Unix socket.
type fcClient struct {
	// http is an HTTP client whose transport dials the Unix socket below.
	http *http.Client
	// socketPath is the filesystem path of the FC API socket.
	socketPath string
}

// newFCClient returns a client for the Firecracker API listening on the given
// Unix socket. The URL host in requests is ignored; every connection is dialed
// to socketPath.
func newFCClient(socketPath string) *fcClient {
	return &fcClient{
		socketPath: socketPath,
		http: &http.Client{
			Transport: &http.Transport{
				DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
					var d net.Dialer
					return d.DialContext(ctx, "unix", socketPath)
				},
			},
			// No global timeout — callers pass context.Context with appropriate
			// deadlines. A fixed 10s timeout was too short for snapshot/resume
			// operations on large-memory VMs (20GB+ memfiles).
		},
	}
}
// do issues a single request against the FC API socket. body, if non-nil, is
// JSON-encoded and sent with a Content-Type header. Transport failures and
// non-2xx statuses are returned as errors annotated with method and path; a
// non-2xx response includes the response body text for diagnosis.
func (c *fcClient) do(ctx context.Context, method, path string, body any) error {
	var bodyReader io.Reader
	if body != nil {
		data, err := json.Marshal(body)
		if err != nil {
			return fmt.Errorf("marshal request body: %w", err)
		}
		bodyReader = bytes.NewReader(data)
	}
	// The host in the URL is ignored for Unix sockets; we use "localhost" by convention.
	req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}
	resp, err := c.http.Do(req)
	if err != nil {
		return fmt.Errorf("%s %s: %w", method, path, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		respBody, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
	}
	// NOTE(review): the success path closes the body without draining it,
	// which prevents connection reuse by the transport.
	return nil
}
// setBootSource configures the kernel and boot args.
// Must be called before startVM.
func (c *fcClient) setBootSource(ctx context.Context, kernelPath, bootArgs string) error {
	return c.do(ctx, http.MethodPut, "/boot-source", map[string]string{
		"kernel_image_path": kernelPath,
		"boot_args":         bootArgs,
	})
}

// setRootfsDrive configures the root filesystem drive. The drive is always
// registered as the root device (is_root_device is hardwired to true).
func (c *fcClient) setRootfsDrive(ctx context.Context, driveID, path string, readOnly bool) error {
	return c.do(ctx, http.MethodPut, "/drives/"+driveID, map[string]any{
		"drive_id":       driveID,
		"path_on_host":   path,
		"is_root_device": true,
		"is_read_only":   readOnly,
	})
}

// setNetworkInterface configures a network interface attached to a TAP device.
// A tx_rate_limiter caps sustained guest→host throughput to prevent user
// application traffic from completely saturating the TAP device and starving
// envd control traffic (PTY, exec, file ops).
func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, macAddr string) error {
	return c.do(ctx, http.MethodPut, "/network-interfaces/"+ifaceID, map[string]any{
		"iface_id":      ifaceID,
		"host_dev_name": tapName,
		"guest_mac":     macAddr,
		"tx_rate_limiter": map[string]any{
			"bandwidth": map[string]any{
				"size":           209715200, // 200 MB/s sustained
				"refill_time":    1000,      // refill period: 1 second
				"one_time_burst": 104857600, // 100 MB initial burst
			},
		},
	})
}

// setMachineConfig configures vCPUs, memory, and other machine settings.
// SMT is disabled unconditionally.
func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error {
	return c.do(ctx, http.MethodPut, "/machine-config", map[string]any{
		"vcpu_count":   vcpus,
		"mem_size_mib": memMB,
		"smt":          false,
	})
}

// setMMDSConfig enables MMDS V2 token-based access on the given network interface.
// Must be called before startVM.
func (c *fcClient) setMMDSConfig(ctx context.Context, ifaceID string) error {
	return c.do(ctx, http.MethodPut, "/mmds/config", map[string]any{
		"version":            "V2",
		"network_interfaces": []string{ifaceID},
	})
}
// mmdsMetadata is the metadata payload written to the Firecracker MMDS store.
// envd reads this via PollForMMDSOpts to populate WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID.
// Note the wire names differ from the field names: instanceID / envID.
type mmdsMetadata struct {
	SandboxID  string `json:"instanceID"`
	TemplateID string `json:"envID"`
}

// setMMDS writes sandbox metadata to the Firecracker MMDS store.
// Can be called after the VM has started.
func (c *fcClient) setMMDS(ctx context.Context, sandboxID, templateID string) error {
	return c.do(ctx, http.MethodPut, "/mmds", mmdsMetadata{
		SandboxID:  sandboxID,
		TemplateID: templateID,
	})
}

// setBalloon configures the Firecracker balloon device for dynamic memory
// management. deflateOnOom lets the guest reclaim balloon pages under memory
// pressure. statsInterval enables periodic stats via GET /balloon/statistics.
// Must be called before startVM.
func (c *fcClient) setBalloon(ctx context.Context, amountMiB int, deflateOnOom bool, statsIntervalS int) error {
	return c.do(ctx, http.MethodPut, "/balloon", map[string]any{
		"amount_mib":               amountMiB,
		"deflate_on_oom":           deflateOnOom,
		"stats_polling_interval_s": statsIntervalS,
	})
}
// updateBalloon adjusts the balloon target at runtime (PATCH, as opposed to
// the initial PUT done by setBalloon).
func (c *fcClient) updateBalloon(ctx context.Context, amountMiB int) error {
	return c.do(ctx, http.MethodPatch, "/balloon", map[string]any{
		"amount_mib": amountMiB,
	})
}

// startVM issues the InstanceStart action.
func (c *fcClient) startVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/actions", map[string]string{
		"action_type": "InstanceStart",
	})
}

// pauseVM pauses the microVM.
func (c *fcClient) pauseVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
		"state": "Paused",
	})
}

// resumeVM resumes a paused microVM.
func (c *fcClient) resumeVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
		"state": "Resumed",
	})
}

// createSnapshot creates a VM snapshot.
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
	return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{
		"snapshot_type": snapshotType,
		"snapshot_path": snapPath,
		"mem_file_path": memPath,
	})
}

// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
// lazy memory loading. Firecracker will connect to the socket and
// send the uffd fd + memory region mappings.
// resume_vm is false here: the caller resumes explicitly after the load.
func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error {
	return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
		"snapshot_path": snapPath,
		"resume_vm":     false,
		"mem_backend": map[string]any{
			"backend_type": "Uffd",
			"backend_path": uffdSocketPath,
		},
	})
}

View File

@ -9,14 +9,14 @@ import (
"time" "time"
) )
// VM represents a running Firecracker microVM. // VM represents a running Cloud Hypervisor microVM.
type VM struct { type VM struct {
Config VMConfig Config VMConfig
process *process process *process
client *fcClient client *chClient
} }
// Manager handles the lifecycle of Firecracker microVMs. // Manager handles the lifecycle of Cloud Hypervisor microVMs.
type Manager struct { type Manager struct {
mu sync.RWMutex mu sync.RWMutex
// vms tracks running VMs by sandbox ID. // vms tracks running VMs by sandbox ID.
@ -30,7 +30,7 @@ func NewManager() *Manager {
} }
} }
// Create boots a new Firecracker microVM with the given configuration. // Create boots a new Cloud Hypervisor microVM with the given configuration.
// The network namespace and TAP device must already be set up. // The network namespace and TAP device must already be set up.
func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) { func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
cfg.applyDefaults() cfg.applyDefaults()
@ -38,7 +38,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return nil, fmt.Errorf("invalid config: %w", err) return nil, fmt.Errorf("invalid config: %w", err)
} }
// Clean up any leftover socket from a previous run.
os.Remove(cfg.SocketPath) os.Remove(cfg.SocketPath)
slog.Info("creating VM", slog.Info("creating VM",
@ -47,7 +46,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
"memory_mb", cfg.MemoryMB, "memory_mb", cfg.MemoryMB,
) )
// Step 1: Launch the Firecracker process. // Step 1: Launch the Cloud Hypervisor process.
proc, err := startProcess(ctx, &cfg) proc, err := startProcess(ctx, &cfg)
if err != nil { if err != nil {
return nil, fmt.Errorf("start process: %w", err) return nil, fmt.Errorf("start process: %w", err)
@ -59,25 +58,18 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return nil, fmt.Errorf("wait for socket: %w", err) return nil, fmt.Errorf("wait for socket: %w", err)
} }
// Step 3: Configure the VM via the Firecracker API. // Step 3: Configure and boot the VM via a single API call.
client := newFCClient(cfg.SocketPath) client := newCHClient(cfg.SocketPath)
if err := configureVM(ctx, client, &cfg); err != nil { if err := client.createVM(ctx, &cfg); err != nil {
_ = proc.stop() _ = proc.stop()
return nil, fmt.Errorf("configure VM: %w", err) return nil, fmt.Errorf("create VM config: %w", err)
} }
// Step 4: Start the VM. // Step 4: Boot the VM.
if err := client.startVM(ctx); err != nil { if err := client.bootVM(ctx); err != nil {
_ = proc.stop() _ = proc.stop()
return nil, fmt.Errorf("start VM: %w", err) return nil, fmt.Errorf("boot VM: %w", err)
}
// Step 5: Push sandbox metadata into MMDS so envd can read
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("set MMDS metadata: %w", err)
} }
vm := &VM{ vm := &VM{
@ -95,46 +87,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return vm, nil return vm, nil
} }
// configureVM sends the configuration to Firecracker via its HTTP API.
func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
// Boot source (kernel + args)
if err := client.setBootSource(ctx, cfg.KernelPath, cfg.kernelArgs()); err != nil {
return fmt.Errorf("set boot source: %w", err)
}
// Root drive — use the symlink path inside the mount namespace so that
// snapshots record a stable path that works on restore.
rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
return fmt.Errorf("set rootfs drive: %w", err)
}
// Network interface
if err := client.setNetworkInterface(ctx, "eth0", cfg.TapDevice, cfg.TapMAC); err != nil {
return fmt.Errorf("set network interface: %w", err)
}
// Machine config (vCPUs + memory)
if err := client.setMachineConfig(ctx, cfg.VCPUs, cfg.MemoryMB); err != nil {
return fmt.Errorf("set machine config: %w", err)
}
// Balloon device — allows the host to reclaim unused guest memory.
// Start with 0 (no inflation). deflate_on_oom lets the guest reclaim
// balloon pages under memory pressure. Stats interval enables monitoring.
if err := client.setBalloon(ctx, 0, true, 5); err != nil {
slog.Warn("set balloon failed (non-fatal, VM will run without memory reclaim)", "error", err)
}
// MMDS config — enable V2 token access on eth0 so that envd can read
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
if err := client.setMMDSConfig(ctx, "eth0"); err != nil {
return fmt.Errorf("set MMDS config: %w", err)
}
return nil
}
// Pause pauses a running VM. // Pause pauses a running VM.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error { func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
m.mu.RLock() m.mu.RLock()
@ -179,7 +131,8 @@ func (m *Manager) UpdateBalloon(ctx context.Context, sandboxID string, amountMiB
return fmt.Errorf("VM not found: %s", sandboxID) return fmt.Errorf("VM not found: %s", sandboxID)
} }
return vm.client.updateBalloon(ctx, amountMiB) sizeBytes := int64(amountMiB) * 1024 * 1024
return vm.client.resizeBalloon(ctx, sizeBytes)
} }
// Destroy stops and cleans up a VM. // Destroy stops and cleans up a VM.
@ -195,12 +148,17 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
slog.Info("destroying VM", "sandbox", sandboxID) slog.Info("destroying VM", "sandbox", sandboxID)
// Stop the Firecracker process. // Try clean shutdown first, fall back to process kill.
shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second)
if err := vm.client.shutdownVMM(shutdownCtx); err != nil {
slog.Debug("clean VMM shutdown failed, killing process", "sandbox", sandboxID, "error", err)
}
shutdownCancel()
if err := vm.process.stop(); err != nil { if err := vm.process.stop(); err != nil {
slog.Warn("error stopping process", "sandbox", sandboxID, "error", err) slog.Warn("error stopping process", "sandbox", sandboxID, "error", err)
} }
// Clean up the API socket.
os.Remove(vm.Config.SocketPath) os.Remove(vm.Config.SocketPath)
slog.Info("VM destroyed", "sandbox", sandboxID) slog.Info("VM destroyed", "sandbox", sandboxID)
@ -208,8 +166,8 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
} }
// Snapshot creates a VM snapshot. The VM must already be paused. // Snapshot creates a VM snapshot. The VM must already be paused.
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume). // destURL is the file:// URL to the snapshot directory.
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error { func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapshotDir string) error {
m.mu.RLock() m.mu.RLock()
vm, ok := m.vms[sandboxID] vm, ok := m.vms[sandboxID]
m.mu.RUnlock() m.mu.RUnlock()
@ -217,29 +175,35 @@ func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, sn
return fmt.Errorf("VM not found: %s", sandboxID) return fmt.Errorf("VM not found: %s", sandboxID)
} }
if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil { destURL := "file://" + snapshotDir
if err := vm.client.snapshotVM(ctx, destURL); err != nil {
return fmt.Errorf("create snapshot: %w", err) return fmt.Errorf("create snapshot: %w", err)
} }
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType) slog.Info("VM snapshot created", "sandbox", sandboxID, "snapshot_dir", snapshotDir)
return nil return nil
} }
// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot // CreateFromSnapshot boots a new Cloud Hypervisor VM by restoring from a
// using UFFD for lazy memory loading. The network namespace and TAP // snapshot directory. The network namespace and TAP device must already be set up.
// device must already be set up.
// //
// No boot resources (kernel, drives, machine config) are configured — // A bare CH process is started first, then the restore is performed via the API
// the snapshot carries all that state. The rootfs path recorded in the // with memory_restore_mode=OnDemand for UFFD-based lazy page loading. This means
// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4 // only pages the guest actually touches are faulted in from disk — a 16GB template
// inside the mount namespace (created by the start script in jailer.go). // with 2GB active working set only loads ~2GB into RAM at restore time.
//
// The restore API also sets resume=true, so the VM starts running immediately
// without a separate resume call.
//
// The rootfs path recorded in the snapshot is resolved via a stable symlink at
// SandboxDir/rootfs.ext4 inside the mount namespace.
// //
// The sequence is: // The sequence is:
// 1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink) // 1. Start bare CH process in mount+network namespace
// 2. Wait for API socket // 2. Wait for API socket
// 3. Load snapshot with UFFD backend // 3. Restore VM via API (OnDemand memory + auto-resume)
// 4. Resume VM execution func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshotDir string) (*VM, error) {
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) { cfg.SnapshotDir = snapshotDir
cfg.applyDefaults() cfg.applyDefaults()
if err := cfg.validate(); err != nil { if err := cfg.validate(); err != nil {
return nil, fmt.Errorf("invalid config: %w", err) return nil, fmt.Errorf("invalid config: %w", err)
@ -249,14 +213,11 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
slog.Info("restoring VM from snapshot", slog.Info("restoring VM from snapshot",
"sandbox", cfg.SandboxID, "sandbox", cfg.SandboxID,
"snap_path", snapPath, "snapshot_dir", snapshotDir,
) )
// Step 1: Launch the Firecracker process. // Step 1: Launch bare CH process (no --restore).
// The start script creates a tmpfs at SandboxDir and symlinks proc, err := startProcessForRestore(ctx, &cfg)
// rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs
// path (/fc-vm/rootfs.ext4) resolves to the new clone.
proc, err := startProcess(ctx, &cfg)
if err != nil { if err != nil {
return nil, fmt.Errorf("start process: %w", err) return nil, fmt.Errorf("start process: %w", err)
} }
@ -267,26 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
return nil, fmt.Errorf("wait for socket: %w", err) return nil, fmt.Errorf("wait for socket: %w", err)
} }
client := newFCClient(cfg.SocketPath) client := newCHClient(cfg.SocketPath)
// Step 3: Load the snapshot with UFFD backend. // Step 3: Restore via API with OnDemand memory + auto-resume.
// No boot resources are configured — the snapshot carries kernel, sourceURL := "file://" + snapshotDir
// drive, network, and machine config state. if err := client.restoreVM(ctx, sourceURL); err != nil {
if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
_ = proc.stop() _ = proc.stop()
return nil, fmt.Errorf("load snapshot: %w", err) return nil, fmt.Errorf("restore VM: %w", err)
}
// Step 4: Resume the VM.
if err := client.resumeVM(ctx); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("resume VM: %w", err)
}
// Step 5: Push sandbox metadata into MMDS.
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("set MMDS metadata: %w", err)
} }
vm := &VM{ vm := &VM{
@ -304,11 +252,15 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
} }
// PID returns the process ID of the unshare wrapper process. // PID returns the process ID of the unshare wrapper process.
// The actual Firecracker process is a direct child of this PID.
func (v *VM) PID() int { func (v *VM) PID() int {
return v.process.cmd.Process.Pid return v.process.cmd.Process.Pid
} }
// Exited returns a channel that is closed when the VM process exits.
func (v *VM) Exited() <-chan struct{} {
return v.process.exited()
}
// Get returns a running VM by sandbox ID. // Get returns a running VM by sandbox ID.
func (m *Manager) Get(sandboxID string) (*VM, bool) { func (m *Manager) Get(sandboxID string) (*VM, bool) {
m.mu.RLock() m.mu.RLock()
@ -317,7 +269,7 @@ func (m *Manager) Get(sandboxID string) (*VM, bool) {
return vm, ok return vm, ok
} }
// waitForSocket polls for the Firecracker API socket to appear on disk. // waitForSocket polls for the Cloud Hypervisor API socket to appear on disk.
func waitForSocket(ctx context.Context, socketPath string, proc *process) error { func waitForSocket(ctx context.Context, socketPath string, proc *process) error {
ticker := time.NewTicker(10 * time.Millisecond) ticker := time.NewTicker(10 * time.Millisecond)
defer ticker.Stop() defer ticker.Stop()
@ -329,7 +281,7 @@ func waitForSocket(ctx context.Context, socketPath string, proc *process) error
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
case <-proc.exited(): case <-proc.exited():
return fmt.Errorf("firecracker process exited before socket was ready") return fmt.Errorf("cloud-hypervisor process exited before socket was ready")
case <-timeout: case <-timeout:
return fmt.Errorf("timed out waiting for API socket at %s", socketPath) return fmt.Errorf("timed out waiting for API socket at %s", socketPath)
case <-ticker.C: case <-ticker.C:

View File

@ -10,7 +10,7 @@ import (
"time" "time"
) )
// process represents a running Firecracker process with mount and network // process represents a running Cloud Hypervisor process with mount and network
// namespace isolation. // namespace isolation.
type process struct { type process struct {
cmd *exec.Cmd cmd *exec.Cmd
@ -20,33 +20,42 @@ type process struct {
exitErr error exitErr error
} }
// startProcess launches the Firecracker binary inside an isolated mount namespace // startProcess launches the Cloud Hypervisor binary inside an isolated mount
// and the specified network namespace. The launch sequence: // namespace and the specified network namespace. Used for fresh boot (no
// snapshot). The launch sequence:
// //
// 1. unshare -m: creates a private mount namespace // 1. unshare -m: creates a private mount namespace
// 2. mount --make-rprivate /: prevents mount propagation to host // 2. mount --make-rprivate /: prevents mount propagation to host
// 3. mount tmpfs at SandboxDir: ephemeral workspace for this VM // 3. mount tmpfs at SandboxDir: ephemeral workspace for this VM
// 4. symlink kernel and rootfs into SandboxDir // 4. symlink kernel and rootfs into SandboxDir
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured // 5. ip netns exec <ns>: enters the network namespace where TAP is configured
// 6. exec firecracker with the API socket path // 6. exec cloud-hypervisor with the API socket path
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) { func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
// Use a background context for the long-lived Firecracker process.
// The request context (ctx) is only used for the startup phase — we must
// not tie the VM's lifetime to the HTTP request that created it.
execCtx, cancel := context.WithCancel(context.Background())
script := buildStartScript(cfg) script := buildStartScript(cfg)
return launchScript(script, cfg)
}
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
// The restore is performed via the API after the socket is ready, which allows
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
//
// NOTE(review): ctx is accepted for signature symmetry with startProcess but is
// not used by this body — confirm whether startup-phase cancellation should
// apply here as well.
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
	script := buildRestoreScript(cfg)
	return launchScript(script, cfg)
}
func launchScript(script string, cfg *VMConfig) (*process, error) {
execCtx, cancel := context.WithCancel(context.Background())
cmd := exec.CommandContext(execCtx, "unshare", "-m", "--", "bash", "-c", script) cmd := exec.CommandContext(execCtx, "unshare", "-m", "--", "bash", "-c", script)
cmd.SysProcAttr = &syscall.SysProcAttr{ cmd.SysProcAttr = &syscall.SysProcAttr{
Setsid: true, // new session so signals don't propagate from parent Setsid: true,
} }
cmd.Stdout = os.Stdout cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr cmd.Stderr = os.Stderr
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
cancel() cancel()
return nil, fmt.Errorf("start firecracker process: %w", err) return nil, fmt.Errorf("start cloud-hypervisor process: %w", err)
} }
p := &process{ p := &process{
@ -60,7 +69,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
close(p.exitCh) close(p.exitCh)
}() }()
slog.Info("firecracker process started", slog.Info("cloud-hypervisor process started",
"pid", cmd.Process.Pid, "pid", cmd.Process.Pid,
"sandbox", cfg.SandboxID, "sandbox", cfg.SandboxID,
) )
@ -68,35 +77,56 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
return p, nil return p, nil
} }
// buildStartScript generates the bash script that sets up the mount namespace, // buildStartScript generates the bash script for fresh boot: sets up mount
// symlinks kernel/rootfs, and execs Firecracker inside the network namespace. // namespace, symlinks kernel/rootfs, and execs Cloud Hypervisor.
func buildStartScript(cfg *VMConfig) string { func buildStartScript(cfg *VMConfig) string {
return fmt.Sprintf(` return fmt.Sprintf(`
set -euo pipefail set -euo pipefail
# Prevent mount propagation to the host
mount --make-rprivate / mount --make-rprivate /
# Create ephemeral tmpfs workspace
mkdir -p %[1]s mkdir -p %[1]s
mount -t tmpfs tmpfs %[1]s mount -t tmpfs tmpfs %[1]s
# Symlink kernel and rootfs into the workspace
ln -s %[2]s %[1]s/vmlinux ln -s %[2]s %[1]s/vmlinux
ln -s %[3]s %[1]s/rootfs.ext4 ln -s %[3]s %[1]s/rootfs.ext4
# Launch Firecracker inside the network namespace exec ip netns exec %[4]s %[5]s --api-socket path=%[6]s
exec ip netns exec %[4]s %[5]s --api-sock %[6]s
`, `,
cfg.SandboxDir, // 1 cfg.SandboxDir, // 1
cfg.KernelPath, // 2 cfg.KernelPath, // 2
cfg.RootfsPath, // 3 cfg.RootfsPath, // 3
cfg.NetworkNamespace, // 4 cfg.NetworkNamespace, // 4
cfg.FirecrackerBin, // 5 cfg.VMMBin, // 5
cfg.SocketPath, // 6 cfg.SocketPath, // 6
) )
} }
// buildRestoreScript generates the bash script used when restoring from a
// snapshot. It makes the mount namespace private, prepares a tmpfs workspace
// with the rootfs symlinked in, and execs a bare Cloud Hypervisor inside the
// sandbox's network namespace. The restore itself is driven later over the
// API (PUT /vm.restore) once the socket is ready, which enables
// memory_restore_mode=OnDemand for UFFD lazy paging.
func buildRestoreScript(cfg *VMConfig) string {
	// Raw script template; %[n]s placeholders are filled positionally below.
	const script = `
set -euo pipefail
mount --make-rprivate /
mkdir -p %[1]s
mount -t tmpfs tmpfs %[1]s
ln -s %[2]s %[1]s/rootfs.ext4
exec ip netns exec %[3]s %[4]s --api-socket path=%[5]s
`
	return fmt.Sprintf(script,
		cfg.SandboxDir,       // 1: tmpfs workspace
		cfg.RootfsPath,       // 2: rootfs image to symlink
		cfg.NetworkNamespace, // 3: netns name
		cfg.VMMBin,           // 4: cloud-hypervisor binary
		cfg.SocketPath,       // 5: API socket path
	)
}
// stop sends SIGTERM and waits for the process to exit. If it doesn't exit // stop sends SIGTERM and waits for the process to exit. If it doesn't exit
// within 10 seconds, SIGKILL is sent. // within 10 seconds, SIGKILL is sent.
func (p *process) stop() error { func (p *process) stop() error {
@ -104,7 +134,6 @@ func (p *process) stop() error {
return nil return nil
} }
// Send SIGTERM to the process group (negative PID).
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGTERM); err != nil { if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGTERM); err != nil {
slog.Debug("sigterm failed, process may have exited", "error", err) slog.Debug("sigterm failed, process may have exited", "error", err)
} }
@ -113,7 +142,7 @@ func (p *process) stop() error {
case <-p.exitCh: case <-p.exitCh:
return nil return nil
case <-time.After(10 * time.Second): case <-time.After(10 * time.Second):
slog.Warn("firecracker did not exit after SIGTERM, sending SIGKILL") slog.Warn("cloud-hypervisor did not exit after SIGTERM, sending SIGKILL")
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGKILL); err != nil { if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGKILL); err != nil {
slog.Debug("sigkill failed", "error", err) slog.Debug("sigkill failed", "error", err)
} }

View File

@ -177,10 +177,11 @@ func Run(opts ...Option) {
Config: cfg, Config: cfg,
} }
// Host monitor (passive + active reconciliation every 60s). // Host monitor (safety-net reconciliation every 5 minutes).
// Created before API server so the heartbeat handler can trigger immediate // Primary state sync is push-based (host agent callbacks + CP background
// reconciliation when a host recovers from unreachable. // goroutines). The monitor acts as a fallback for missed events, host death
monitor := api.NewHostMonitor(queries, hostPool, al, 60*time.Second) // detection, and transient status resolution.
monitor := api.NewHostMonitor(queries, hostPool, al, 5*time.Minute)
// API server. // API server.
srv := api.New(queries, hostPool, hostScheduler, pool, rdb, []byte(cfg.JWTSecret), oauthRegistry, cfg.OAuthRedirectURL, ca, al, channelSvc, mailer, o.extensions, sctx, monitor, o.version) srv := api.New(queries, hostPool, hostScheduler, pool, rdb, []byte(cfg.JWTSecret), oauthRegistry, cfg.OAuthRedirectURL, ca, al, channelSvc, mailer, o.extensions, sctx, monitor, o.version)

View File

@ -326,7 +326,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err)) s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
return return
} }
// Capture sandbox metadata (envd/kernel/firecracker/agent versions). // Capture sandbox metadata (envd/kernel/vmm/agent versions).
sandboxMetadata := resp.Msg.Metadata sandboxMetadata := resp.Msg.Metadata
// Record sandbox/host association. // Record sandbox/host association.
@ -768,7 +768,7 @@ var runtimeEnvVars = map[string]bool{
"HOME": true, "USER": true, "LOGNAME": true, "SHELL": true, "HOME": true, "USER": true, "LOGNAME": true, "SHELL": true,
"PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true, "PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true,
"SHLVL": true, "_": true, "SHLVL": true, "_": true,
// Per-sandbox identifiers set by envd at boot via MMDS. // Per-sandbox identifiers set by envd at boot via PostInit.
"WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true, "WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true,
} }