From eaa6b8576dc2184a76ae93f7335dd7ea5711912e Mon Sep 17 00:00:00 2001 From: pptx704 Date: Sun, 17 May 2026 01:33:12 +0600 Subject: [PATCH] feat(vm): replace Firecracker with Cloud Hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required. Key changes: - Remove fc.go, jailer.go (FC process management) - Remove internal/uffd/ package (userfaultfd lazy page loading) - Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format) - Add ch.go (CH HTTP API client over Unix socket) - Add process.go (CH process lifecycle with unshare+netns) - Add chversion.go (CH version detection) - Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher - Simplify snapshot/local.go to CH's native snapshot format - Update VM config: FirecrackerBin → VMMBin, new CH-specific fields - Update envdclient, devicemapper, network for CH compatibility --- cmd/host-agent/main.go | 24 +- internal/api/host_monitor.go | 17 +- internal/api/sandbox_event_consumer.go | 32 +- internal/devicemapper/devicemapper.go | 40 +- internal/envdclient/client.go | 40 +- internal/layout/layout.go | 2 +- internal/network/setup.go | 2 +- internal/sandbox/chversion.go | 28 ++ internal/sandbox/fcversion.go | 30 -- internal/sandbox/manager.go | 662 ++++++++++--------------- internal/sandbox/proc.go | 13 +- internal/snapshot/header.go | 221 --------- internal/snapshot/local.go | 133 +---- internal/snapshot/mapping.go | 214 -------- internal/snapshot/memfile.go | 285 ----------- internal/uffd/fd.go | 92 ---- internal/uffd/region.go | 41 -- internal/uffd/server.go | 451 ----------------- internal/vm/ch.go | 213 ++++++++ internal/vm/config.go | 31 +- internal/vm/fc.go | 202 -------- internal/vm/manager.go | 162 +++--- internal/vm/{jailer.go => process.go} | 73 ++- pkg/cpserver/run.go | 9 +- pkg/service/build.go | 4 +- 25 files changed, 754 insertions(+), 2267 deletions(-) create mode 100644 internal/sandbox/chversion.go delete mode 100644 internal/sandbox/fcversion.go delete mode 100644 internal/snapshot/header.go delete mode 100644 internal/snapshot/mapping.go delete mode 100644 internal/snapshot/memfile.go delete mode 100644 internal/uffd/fd.go delete mode 100644 internal/uffd/region.go delete mode 100644 internal/uffd/server.go create mode 100644 internal/vm/ch.go delete mode 100644 internal/vm/fc.go rename internal/vm/{jailer.go => process.go} (53%) diff --git a/cmd/host-agent/main.go b/cmd/host-agent/main.go index ddfc82c..0337b80 100644 --- a/cmd/host-agent/main.go +++ b/cmd/host-agent/main.go @@ -126,22 +126,22 @@ func main() { } slog.Info("resolved kernel", "version", kernelVersion, "path", kernelPath) - // Detect firecracker version. - fcBin := envOrDefault("WRENN_FIRECRACKER_BIN", "/usr/local/bin/firecracker") - fcVersion, err := sandbox.DetectFirecrackerVersion(fcBin) + // Detect cloud-hypervisor version. 
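
The commit message notes that the new ch.go talks to Cloud Hypervisor's HTTP API over a Unix socket. For readers unfamiliar with that pattern, a minimal standalone sketch of such a transport follows; the socket path is illustrative, not the value this patch uses.

    package main

    import (
    	"context"
    	"fmt"
    	"net"
    	"net/http"
    )

    func main() {
    	httpc := &http.Client{
    		Transport: &http.Transport{
    			// Every request dials the CH API socket; the URL host below
    			// is a placeholder that the custom dialer ignores.
    			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
    				var d net.Dialer
    				return d.DialContext(ctx, "unix", "/run/ch/api.sock")
    			},
    		},
    	}
    	resp, err := httpc.Get("http://localhost/api/v1/vm.info")
    	if err != nil {
    		fmt.Println("vm.info:", err)
    		return
    	}
    	defer resp.Body.Close()
    	fmt.Println("status:", resp.Status)
    }
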
+ chBin := envOrDefault("WRENN_CH_BIN", "/usr/local/bin/cloud-hypervisor") + chVersion, err := sandbox.DetectCHVersion(chBin) if err != nil { - slog.Error("failed to detect firecracker version", "error", err) + slog.Error("failed to detect cloud-hypervisor version", "error", err) os.Exit(1) } - slog.Info("resolved firecracker", "version", fcVersion, "path", fcBin) + slog.Info("resolved cloud-hypervisor", "version", chVersion, "path", chBin) cfg := sandbox.Config{ WrennDir: rootDir, DefaultRootfsSizeMB: defaultRootfsSizeMB, KernelPath: kernelPath, KernelVersion: kernelVersion, - FirecrackerBin: fcBin, - FirecrackerVersion: fcVersion, + VMMBin: chBin, + VMMVersion: chVersion, AgentVersion: version, } @@ -245,12 +245,16 @@ func main() { }, ) - // Graceful shutdown on SIGINT/SIGTERM. + // Graceful shutdown on SIGINT/SIGTERM. A second signal force-exits + // so the operator can always kill the process if shutdown hangs. sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) go func() { sig := <-sigCh - doShutdown("signal: " + sig.String()) + go doShutdown("signal: " + sig.String()) + sig = <-sigCh + slog.Error("received second signal, force exiting", "signal", sig.String()) + os.Exit(1) }() slog.Info("host agent starting", "addr", listenAddr, "host_id", creds.HostID, "version", version, "commit", commit) @@ -292,7 +296,7 @@ func checkPrivileges() error { name string }{ {1, "CAP_DAC_OVERRIDE"}, // /dev/loop*, /dev/mapper/*, /dev/net/tun - {5, "CAP_KILL"}, // SIGTERM/SIGKILL to Firecracker processes + {5, "CAP_KILL"}, // SIGTERM/SIGKILL to cloud-hypervisor processes {12, "CAP_NET_ADMIN"}, // netlink, iptables, routing, TAP/veth {13, "CAP_NET_RAW"}, // raw sockets (iptables) {19, "CAP_SYS_PTRACE"}, // reading /proc/self/ns/net (netns.Get) diff --git a/internal/api/host_monitor.go b/internal/api/host_monitor.go index 06b3c28..0a3c36c 100644 --- a/internal/api/host_monitor.go +++ b/internal/api/host_monitor.go @@ -19,6 +19,12 @@ import ( // it is considered unreachable (3 missed 30-second heartbeats). const unreachableThreshold = 90 * time.Second +// transientGracePeriod is how long a sandbox is allowed to stay in a transient +// status (starting, resuming, pausing, stopping) before the monitor infers a +// final state. This prevents the monitor from racing against in-flight RPCs +// that may not have registered the sandbox on the host agent yet. +const transientGracePeriod = 2 * time.Minute + // HostMonitor runs on a fixed interval and performs two duties: // // 1. Passive check: marks hosts whose last_heartbeat_at is stale as @@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) { } continue } - // Sandbox is not alive on host — infer final state. + // Sandbox is not alive on host. If the transition is recent, give the + // in-flight RPC time to finish before declaring a final state. + if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod { + slog.Debug("host monitor: transient sandbox still within grace period", + "sandbox_id", sbIDStr, "status", sb.Status, + "age", time.Since(sb.LastUpdated.Time).Round(time.Second)) + continue + } + + // Grace period expired — infer final state. 
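
As an aside, the two-signal shutdown added to main.go above reduces to this standalone pattern (doShutdown stands in for the agent's real cleanup, which exits the process when it finishes):

    package main

    import (
    	"log/slog"
    	"os"
    	"os/signal"
    	"syscall"
    )

    func main() {
    	sigCh := make(chan os.Signal, 1)
    	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

    	sig := <-sigCh
    	go doShutdown("signal: " + sig.String()) // graceful path; may hang

    	sig = <-sigCh // operator escalated with a second signal
    	slog.Error("received second signal, force exiting", "signal", sig.String())
    	os.Exit(1)
    }

    func doShutdown(reason string) {
    	slog.Info("shutting down", "reason", reason)
    	// real agent: drain sandboxes, flush events, then os.Exit(0)
    }
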
var finalStatus string switch sb.Status { case "starting", "resuming": diff --git a/internal/api/sandbox_event_consumer.go b/internal/api/sandbox_event_consumer.go index fcd3d94..e5a4ad9 100644 --- a/internal/api/sandbox_event_consumer.go +++ b/internal/api/sandbox_event_consumer.go @@ -42,6 +42,7 @@ const ( SandboxEventResumed = "sandbox.resumed" SandboxEventStopped = "sandbox.stopped" SandboxEventFailed = "sandbox.failed" + SandboxEventError = "sandbox.error" SandboxEventAutoPaused = "sandbox.auto_paused" ) @@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes c.handlePaused(ctx, sandboxID, event) case SandboxEventStopped: c.handleStopped(ctx, sandboxID, event) - case SandboxEventFailed: + case SandboxEventFailed, SandboxEventError: c.handleFailed(ctx, sandboxID) case SandboxEventAutoPaused: c.handleAutoPaused(ctx, sandboxID, event) @@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp } func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) { + // Try stopping → stopped (CP-initiated destroy completed). if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ ID: sandboxID, Status: "stopping", Status_2: "stopped", + }); err == nil { + return + } + // Try running → stopped (autonomous destroy, e.g. TTL auto-destroy). + if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ + ID: sandboxID, + Status: "running", + Status_2: "stopped", }); err != nil && !errors.Is(err, pgx.ErrNoRows) { slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err) } } -// handleFailed is a no-op fallback — the background goroutine already -// performed the conditional DB update before publishing this event. -// We keep the case arm so unknown event types are flagged, but avoid -// an unconditional status write that could clobber concurrent operations. -func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {} +// handleFailed marks a sandbox as "error" when the host agent reports a crash +// or the CP's background goroutine publishes a failure. Uses conditional update +// to avoid clobbering concurrent operations. +func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) { + // Try running → error (VM crash pushed by host agent). + if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ + ID: sandboxID, Status: "running", Status_2: "error", + }); err == nil { + return + } + // Try starting → error (create failed). 
+ _, _ = c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ + ID: sandboxID, Status: "starting", Status_2: "error", + }) +} func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) { sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{ diff --git a/internal/devicemapper/devicemapper.go b/internal/devicemapper/devicemapper.go index 9fa0833..f65ec6a 100644 --- a/internal/devicemapper/devicemapper.go +++ b/internal/devicemapper/devicemapper.go @@ -80,8 +80,8 @@ func (r *LoopRegistry) Release(imagePath string) { e.refcount-- if e.refcount <= 0 { - if err := losetupDetach(e.device); err != nil { - slog.Warn("losetup detach failed", "device", e.device, "error", err) + if err := losetupDetachRetry(e.device); err != nil { + slog.Error("losetup detach failed, loop device leaked", "device", e.device, "image", imagePath, "error", err) } delete(r.entries, imagePath) slog.Info("loop device released", "image", imagePath, "device", e.device) @@ -94,8 +94,8 @@ func (r *LoopRegistry) ReleaseAll() { defer r.mu.Unlock() for path, e := range r.entries { - if err := losetupDetach(e.device); err != nil { - slog.Warn("losetup detach failed", "device", e.device, "error", err) + if err := losetupDetachRetry(e.device); err != nil { + slog.Error("losetup detach failed during shutdown", "device", e.device, "image", path, "error", err) } delete(r.entries, path) } @@ -134,8 +134,8 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes, cowSiz // space to store all modified blocks (it's sparse, so 20GB costs nothing). sectors := originSizeBytes / 512 if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { - if detachErr := losetupDetach(cowLoopDev); detachErr != nil { - slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) + if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil { + slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr) } os.Remove(cowPath) return nil, fmt.Errorf("dmsetup create: %w", err) @@ -178,8 +178,8 @@ func RestoreSnapshot(ctx context.Context, name, originLoopDev, cowPath string, o sectors := originSizeBytes / 512 if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { - if detachErr := losetupDetach(cowLoopDev); detachErr != nil { - slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) + if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil { + slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr) } return nil, fmt.Errorf("dmsetup create: %w", err) } @@ -208,8 +208,8 @@ func RemoveSnapshot(ctx context.Context, dev *SnapshotDevice) error { return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err) } - if err := losetupDetach(dev.CowLoopDev); err != nil { - slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err) + if err := losetupDetachRetry(dev.CowLoopDev); err != nil { + return fmt.Errorf("detach cow loop %s: %w", dev.CowLoopDev, err) } slog.Info("dm-snapshot removed", "name", dev.Name) @@ -297,6 +297,24 @@ func losetupDetach(dev string) error { return exec.Command("losetup", "-d", dev).Run() } +// losetupDetachRetry detaches a loop device with retries for transient +// "device busy" errors (kernel may still hold references briefly after +// dm-snapshot removal). 
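
The helper below shares its bounded-retry shape with dmsetupRemove further down; factored generically it would look like this (sketch only, not part of this patch):

    package devicemapper

    import (
    	"fmt"
    	"time"
    )

    // retryOp runs op up to attempts times, sleeping delay between tries.
    // Mirrors the 5 x 200ms schedule used by losetupDetachRetry.
    func retryOp(attempts int, delay time.Duration, op func() error) error {
    	var lastErr error
    	for i := 0; i < attempts; i++ {
    		if i > 0 {
    			time.Sleep(delay)
    		}
    		if lastErr = op(); lastErr == nil {
    			return nil
    		}
    	}
    	return fmt.Errorf("after %d attempts: %w", attempts, lastErr)
    }

Usage then collapses to retryOp(5, 200*time.Millisecond, func() error { return losetupDetach(dev) }).
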
+func losetupDetachRetry(dev string) error { + var lastErr error + for attempt := range 5 { + if attempt > 0 { + time.Sleep(200 * time.Millisecond) + } + if err := losetupDetach(dev); err == nil { + return nil + } else { + lastErr = err + } + } + return fmt.Errorf("after 5 attempts: %w", lastErr) +} + // dmsetupCreate creates a dm-snapshot device with persistent metadata. func dmsetupCreate(name, originDev, cowDev string, sectors int64) error { // Table format: snapshot P @@ -316,7 +334,7 @@ func dmDeviceExists(name string) bool { // dmsetupRemove removes a device-mapper device, retrying on transient // "device busy" errors that occur when the kernel hasn't fully released -// the device after a Firecracker process exits. +// the device after a VMM process exits. func dmsetupRemove(ctx context.Context, name string) error { var lastErr error for attempt := range 5 { diff --git a/internal/envdclient/client.go b/internal/envdclient/client.go index 2229093..eaf4d4c 100644 --- a/internal/envdclient/client.go +++ b/internal/envdclient/client.go @@ -294,7 +294,7 @@ func (c *Client) ReadFile(ctx context.Context, path string) ([]byte, error) { // PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which stops // the port scanner/forwarder and marks active connections for post-restore -// cleanup before Firecracker freezes vCPUs. +// cleanup before the VMM freezes vCPUs. // // Best-effort: the caller should log a warning on error but not abort the pause. func (c *Client) PrepareSnapshot(ctx context.Context) error { @@ -317,27 +317,33 @@ func (c *Client) PrepareSnapshot(ctx context.Context) error { return nil } -// PostInit calls envd's POST /init endpoint, which triggers a re-read of -// Firecracker MMDS metadata. This updates WRENN_SANDBOX_ID, WRENN_TEMPLATE_ID -// env vars and the corresponding files under /run/wrenn/ inside the guest. -// Must be called after snapshot restore so envd picks up the new sandbox's metadata. +// PostInit calls envd's POST /init endpoint to trigger post-boot or +// post-restore initialization. sandbox_id and template_id are passed +// so envd can set WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID env vars. func (c *Client) PostInit(ctx context.Context) error { - return c.PostInitWithDefaults(ctx, "", nil) + return c.PostInitWithDefaults(ctx, "", nil, "", "") } // PostInitWithDefaults calls envd's POST /init endpoint with optional default -// user and environment variables. These are applied to envd's defaults so all -// subsequent process executions use them. -func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string) error { +// user, environment variables, and sandbox metadata. These are applied to +// envd's defaults so all subsequent process executions use them. 
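
For reference, a call to the function below with every optional set (all values illustrative) serializes as:

    // client.PostInitWithDefaults(ctx, "user",
    //     map[string]string{"CI": "true"}, "sbx_abc123", "tmpl_def456")
    //
    // → POST /init
    // {"defaultUser":"user","envVars":{"CI":"true"},
    //  "sandbox_id":"sbx_abc123","template_id":"tmpl_def456"}
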
+func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID string) error { + payload := make(map[string]any) + if defaultUser != "" { + payload["defaultUser"] = defaultUser + } + if len(envVars) > 0 { + payload["envVars"] = envVars + } + if sandboxID != "" { + payload["sandbox_id"] = sandboxID + } + if templateID != "" { + payload["template_id"] = templateID + } + var body io.Reader - if defaultUser != "" || len(envVars) > 0 { - payload := make(map[string]any) - if defaultUser != "" { - payload["defaultUser"] = defaultUser - } - if len(envVars) > 0 { - payload["envVars"] = envVars - } + if len(payload) > 0 { data, err := json.Marshal(payload) if err != nil { return fmt.Errorf("marshal init body: %w", err) diff --git a/internal/layout/layout.go b/internal/layout/layout.go index fcb11ad..42e2cff 100644 --- a/internal/layout/layout.go +++ b/internal/layout/layout.go @@ -46,7 +46,7 @@ func SandboxesDir(wrennDir string) string { return filepath.Join(wrennDir, "sandboxes") } -// KernelPath returns the path to the Firecracker kernel. +// KernelPath returns the path to the VM kernel. func KernelPath(wrennDir string) string { return filepath.Join(wrennDir, "kernels", "vmlinux") } diff --git a/internal/network/setup.go b/internal/network/setup.go index d68da89..23ab7e3 100644 --- a/internal/network/setup.go +++ b/internal/network/setup.go @@ -176,7 +176,7 @@ func NewSlot(index int) *Slot { // CreateNetwork sets up the full network topology for a sandbox: // - Named network namespace // - Veth pair bridging host and namespace -// - TAP device inside namespace for Firecracker +// - TAP device inside namespace for Cloud Hypervisor // - Routes and NAT rules for connectivity // // On error, all partially created resources are rolled back. diff --git a/internal/sandbox/chversion.go b/internal/sandbox/chversion.go new file mode 100644 index 0000000..96d96b3 --- /dev/null +++ b/internal/sandbox/chversion.go @@ -0,0 +1,28 @@ +package sandbox + +import ( + "fmt" + "os/exec" + "strings" +) + +// DetectCHVersion runs the cloud-hypervisor binary with --version and +// parses the semver from the output (e.g. "cloud-hypervisor v43.0" → "43.0"). +func DetectCHVersion(binaryPath string) (string, error) { + out, err := exec.Command(binaryPath, "--version").Output() + if err != nil { + return "", fmt.Errorf("run %s --version: %w", binaryPath, err) + } + + line := strings.TrimSpace(string(out)) + for field := range strings.FieldsSeq(line) { + v := strings.TrimPrefix(field, "v") + if v != field || strings.Contains(field, ".") { + if strings.Count(v, ".") >= 1 { + return v, nil + } + } + } + + return "", fmt.Errorf("could not parse version from cloud-hypervisor output: %q", line) +} diff --git a/internal/sandbox/fcversion.go b/internal/sandbox/fcversion.go deleted file mode 100644 index 092fbe0..0000000 --- a/internal/sandbox/fcversion.go +++ /dev/null @@ -1,30 +0,0 @@ -package sandbox - -import ( - "fmt" - "os/exec" - "strings" -) - -// DetectFirecrackerVersion runs the firecracker binary with --version and -// parses the semver from the output (e.g. "Firecracker v1.14.1" → "1.14.1"). -func DetectFirecrackerVersion(binaryPath string) (string, error) { - out, err := exec.Command(binaryPath, "--version").Output() - if err != nil { - return "", fmt.Errorf("run %s --version: %w", binaryPath, err) - } - - // Output is typically "Firecracker v1.14.1\n" or similar. 
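
A quick usage example for the new detector above (version value illustrative):

    v, err := sandbox.DetectCHVersion("/usr/local/bin/cloud-hypervisor")
    // with binary output "cloud-hypervisor v43.0\n": v == "43.0", err == nil
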
- line := strings.TrimSpace(string(out)) - for _, field := range strings.Fields(line) { - v := strings.TrimPrefix(field, "v") - if v != field || strings.Contains(field, ".") { - // Either had a "v" prefix or contains a dot — likely the version. - if strings.Count(v, ".") >= 1 { - return v, nil - } - } - } - - return "", fmt.Errorf("could not parse version from firecracker output: %q", line) -} diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index 11fd5df..8b3f85d 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -8,11 +8,9 @@ import ( "os" "os/exec" "path/filepath" - "strings" "sync" "time" - "github.com/google/uuid" "github.com/jackc/pgx/v5/pgtype" "git.omukk.dev/wrenn/wrenn/internal/devicemapper" @@ -21,7 +19,6 @@ import ( "git.omukk.dev/wrenn/wrenn/internal/models" "git.omukk.dev/wrenn/wrenn/internal/network" "git.omukk.dev/wrenn/wrenn/internal/snapshot" - "git.omukk.dev/wrenn/wrenn/internal/uffd" "git.omukk.dev/wrenn/wrenn/internal/vm" "git.omukk.dev/wrenn/wrenn/pkg/id" envdpb "git.omukk.dev/wrenn/wrenn/proto/envd/gen" @@ -34,11 +31,11 @@ type Config struct { DefaultRootfsSizeMB int // target size for template rootfs images; 0 → DefaultDiskSizeMB // Resolved at startup by the host agent. - KernelPath string // path to the latest vmlinux-x.y.z - KernelVersion string // semver extracted from filename - FirecrackerBin string // path to the firecracker binary - FirecrackerVersion string // semver from firecracker --version - AgentVersion string // host agent version (injected via ldflags) + KernelPath string // path to the latest vmlinux-x.y.z + KernelVersion string // semver extracted from filename + VMMBin string // path to the cloud-hypervisor binary + VMMVersion string // semver from cloud-hypervisor --version + AgentVersion string // host agent version (injected via ldflags) } // LifecycleEvent describes an autonomous state change initiated by the agent. @@ -88,45 +85,26 @@ func (m *Manager) SetEventSender(sender EventSender) { // sandboxState holds the runtime state for a single sandbox. type sandboxState struct { models.Sandbox - lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox - slot *network.Slot - client *envdclient.Client - connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain - uffdSocketPath string // non-empty for sandboxes restored from snapshot - dmDevice *devicemapper.SnapshotDevice - baseImagePath string // path to the base template rootfs (for loop registry release) - - // parent holds the snapshot header and diff file paths from which this - // sandbox was restored. Non-nil means re-pause should use "Diff" snapshot - // type instead of "Full", avoiding the UFFD fault-in storm. - parent *snapshotParent + lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox + slot *network.Slot + client *envdclient.Client + connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain + dmDevice *devicemapper.SnapshotDevice + baseImagePath string // path to the base template rootfs (for loop registry release) // Metrics sampling state. 
- fcPID int // Firecracker process PID (child of unshare wrapper) + vmmPID int // VMM process PID (child of unshare wrapper) ring *metricsRing // tiered ring buffers for CPU/mem/disk metrics samplerCancel context.CancelFunc // cancels the per-sandbox sampling goroutine samplerDone chan struct{} // closed when the sampling goroutine exits } -// snapshotParent stores the previous generation's snapshot state so that -// re-pause can produce an incremental diff instead of a full memory dump. -type snapshotParent struct { - header *snapshot.Header - diffPaths map[string]string // build ID → file path -} - -// maxDiffGenerations caps how many incremental diff generations we chain -// before merging diffs into a single file. Since UFFD lazy-loads memory -// anyway, we merge on every re-pause to keep exactly 1 diff file per -// snapshot — no accumulated chain, no extra restore overhead. -const maxDiffGenerations = 1 - // buildMetadata constructs the metadata map with version information. func (m *Manager) buildMetadata(envdVersion string) map[string]string { meta := map[string]string{ - "kernel_version": m.cfg.KernelVersion, - "firecracker_version": m.cfg.FirecrackerVersion, - "agent_version": m.cfg.AgentVersion, + "kernel_version": m.cfg.KernelVersion, + "vmm_version": m.cfg.VMMVersion, + "agent_version": m.cfg.AgentVersion, } if envdVersion != "" { meta["envd_version"] = envdVersion @@ -182,9 +160,9 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template diskSizeMB = 5120 // 5 GB default } - // Check if template refers to a snapshot (has snapfile + memfile + header + rootfs). + // Check if template refers to a CH snapshot (has config.json). tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) - if _, err := os.Stat(filepath.Join(tmplDir, snapshot.SnapFileName)); err == nil { + if _, err := os.Stat(filepath.Join(tmplDir, snapshot.CHConfigFile)); err == nil { return m.createFromSnapshot(ctx, sandboxID, teamID, templateID, vcpus, memoryMB, timeoutSec, diskSizeMB) } @@ -238,7 +216,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template return nil, fmt.Errorf("create network: %w", err) } - // Boot VM — Firecracker gets the dm device path. + // Boot VM — CH gets the dm device path. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: id.UUIDString(templateID), @@ -252,7 +230,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, - FirecrackerBin: m.cfg.FirecrackerBin, + VMMBin: m.cfg.VMMBin, } if _, err := m.vm.Create(ctx, vmCfg); err != nil { @@ -311,6 +289,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template m.mu.Unlock() m.startSampler(sb) + m.startCrashWatcher(sb) slog.Info("sandbox created", "id", sandboxID, @@ -374,10 +353,6 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } - - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } } // Pause takes a snapshot of a running sandbox, then destroys all resources. @@ -390,7 +365,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { } // Serialize lifecycle operations on this sandbox to prevent concurrent - // Pause/Destroy calls from corrupting Firecracker state. + // Pause/Destroy calls from corrupting VM state. 
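
Worked example of the balloon sizing in Step 3 of Pause below, assuming a 4096 MiB sandbox whose guest reports 600 MiB in use:

    usedMiB := 600
    keepMiB := max(usedMiB*3/2, 512) // 900: keep 1.5x used, 512 MiB floor
    inflateMiB := 4096 - keepMiB     // 3196 MiB reclaimed via the balloon
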
sb.lifecycleMu.Lock() defer sb.lifecycleMu.Unlock() @@ -414,7 +389,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { } // Stop the metrics sampler goroutine before tearing down any resources - // it reads (dm device, Firecracker PID). Without this, the sampler + // it reads (dm device, VMM PID). Without this, the sampler // leaks on every successful pause. m.stopSampler(sb) @@ -444,16 +419,16 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { }() // ── Step 3: Inflate balloon to reclaim free guest memory ───────── - // Freed pages become zero from FC's perspective, so ProcessMemfile - // skips them → dramatically smaller memfile (e.g. 20GB → 1GB). + // Freed pages become zero in the snapshot's memory-ranges file. + // CH v52+ writes sparse snapshots natively (SEEK_DATA/SEEK_HOLE). func() { - memUsed, err := readEnvdMemUsed(sb.client) + memUsed, err := readEnvdMemUsed(ctx, sb.client) if err != nil { slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err) return } usedMiB := int(memUsed / (1024 * 1024)) - keepMiB := max(usedMiB*2, 256) + 128 + keepMiB := max(usedMiB*3/2, 512) inflateMiB := sb.MemoryMB - keepMiB if inflateMiB <= 0 { slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB) @@ -478,19 +453,10 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { } slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart)) - // Always use Diff when we have a parent snapshot — Diff only captures - // changed pages and is much faster than Full (which dumps all memory). - // For first-time pauses (no parent) we must use Full. - snapshotType := "Full" - if sb.parent != nil { - snapshotType = "Diff" - } - // resumeOnError unpauses the VM so the sandbox stays usable when a // post-freeze step fails. If the resume itself fails, the sandbox is // frozen and unrecoverable — destroy it to avoid a zombie. resumeOnError := func() { - // Use a fresh context — the caller's ctx may already be cancelled. resumeCtx, resumeCancel := context.WithTimeout(context.Background(), 30*time.Second) defer resumeCancel() if err := m.vm.Resume(resumeCtx, sandboxID); err != nil { @@ -507,126 +473,62 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { restoreRunning() } - // Step 2: Take VM state snapshot (snapfile + memfile). + // ── Step 5: Take CH snapshot ───────────────────────────────────── + // Snapshot to a temp dir first. If the sandbox was previously resumed, + // the old pauseDir still contains memory-ranges that CH's uffd handler + // is lazily paging from. CH refuses to overwrite existing files (EEXIST), + // so we write to a fresh dir and swap after the VM is destroyed. 
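
Under the hood, the vm.Snapshot call below presumably wraps Cloud Hypervisor's PUT /api/v1/vm.snapshot endpoint, whose destination_url names a directory CH populates and refuses to overwrite (hence the fresh dir). A sketch against the Unix-socket client from earlier:

    // CH only snapshots a paused VM, so vm.pause must already have run.
    body, _ := json.Marshal(map[string]string{
    	"destination_url": "file://" + tmpPauseDir,
    })
    req, _ := http.NewRequestWithContext(ctx, http.MethodPut,
    	"http://localhost/api/v1/vm.snapshot", bytes.NewReader(body))
    resp, err := httpc.Do(req)
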
pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) - if err := os.MkdirAll(pauseDir, 0755); err != nil { + tmpPauseDir := pauseDir + ".new" + if err := os.RemoveAll(tmpPauseDir); err != nil { + resumeOnError() + return fmt.Errorf("clean temp snapshot dir: %w", err) + } + if err := os.MkdirAll(tmpPauseDir, 0755); err != nil { resumeOnError() return fmt.Errorf("create snapshot dir: %w", err) } - rawMemPath := filepath.Join(pauseDir, "memfile.raw") - snapPath := filepath.Join(pauseDir, snapshot.SnapFileName) - snapshotStart := time.Now() - if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath, snapshotType); err != nil { - slog.Error("pause: snapshot failed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart), "error", err) - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) + if err := m.vm.Snapshot(ctx, sandboxID, tmpPauseDir); err != nil { + slog.Error("pause: snapshot failed", "id", sandboxID, "elapsed", time.Since(snapshotStart), "error", err) + warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(tmpPauseDir)) resumeOnError() return fmt.Errorf("create VM snapshot: %w", err) } - slog.Debug("pause: FC snapshot created", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(snapshotStart)) + slog.Debug("pause: CH snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart)) - // Step 3: Process the raw memfile into a compact diff + header. - buildID := uuid.New() - headerPath := filepath.Join(pauseDir, snapshot.MemHeaderName) - - processStart := time.Now() - if sb.parent != nil { - // Diff: process against parent header, producing only changed blocks. - diffPath := snapshot.MemDiffPathForBuild(pauseDir, "", buildID) - if _, err := snapshot.ProcessMemfileWithParent(rawMemPath, diffPath, headerPath, sb.parent.header, buildID); err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("process memfile with parent: %w", err) - } - - // Copy previous generation diff files into the snapshot directory. - for prevBuildID, prevPath := range sb.parent.diffPaths { - dstPath := snapshot.MemDiffPathForBuild(pauseDir, "", uuid.MustParse(prevBuildID)) - if prevPath != dstPath { - if err := copyFile(prevPath, dstPath); err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("copy parent diff file: %w", err) - } - } - } - - // If the generation cap is reached, merge all diff files into a - // single file to collapse the chain. This is a file-level operation - // (no Firecracker involvement) so it's fast and reliable. - generation := sb.parent.header.Metadata.Generation + 1 - if generation >= maxDiffGenerations { - slog.Debug("pause: merging diff generations", "id", sandboxID, "generation", generation) - - // Load the header we just wrote (it references all generations). - headerData, err := os.ReadFile(headerPath) - if err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("read header for merge: %w", err) - } - currentHeader, err := snapshot.Deserialize(headerData) - if err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("deserialize header for merge: %w", err) - } - - // Locate all diff files referenced by the header. 
- diffFiles, err := snapshot.ListDiffFiles(pauseDir, "", currentHeader) - if err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("list diff files for merge: %w", err) - } - - // Merge into a single new diff file. - mergedPath := snapshot.MemDiffPath(pauseDir, "") - if _, err := snapshot.MergeDiffs(currentHeader, diffFiles, mergedPath, headerPath); err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("merge diff files: %w", err) - } - - // Remove the old per-generation diff files. - removeStaleMemDiffs(pauseDir) - slog.Debug("pause: diff merge complete", "id", sandboxID) - } - } else { - // Full: first pause — no parent to diff against. - diffPath := snapshot.MemDiffPath(pauseDir, "") - if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { - warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - resumeOnError() - return fmt.Errorf("process memfile: %w", err) - } - } - slog.Debug("pause: memfile processed", "id", sandboxID, "type", snapshotType, "elapsed", time.Since(processStart)) - - // Remove the raw memfile — we only keep the compact diff(s). - os.Remove(rawMemPath) - - // Step 4: Destroy the VM first so Firecracker releases the dm device. + // ── Step 6: Destroy the VM so CH releases the dm device ────────── if err := m.vm.Destroy(ctx, sb.ID); err != nil { slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err) } - // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. + // CH process is dead — uffd handler no longer reads old memory-ranges. + // Replace old snapshot dir with new one. + if err := os.RemoveAll(pauseDir); err != nil { + slog.Warn("pause: failed to remove old snapshot dir", "id", sandboxID, "error", err) + } + if err := os.Rename(tmpPauseDir, pauseDir); err != nil { + warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) + m.slots.Release(sb.SlotIndex) + if sb.dmDevice != nil { + warnErr("dm-snapshot remove error during pause", sandboxID, devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice)) + os.Remove(sb.dmDevice.CowPath) + } + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + m.mu.Lock() + delete(m.boxes, sandboxID) + m.mu.Unlock() + return fmt.Errorf("rename snapshot dir: %w", err) + } + + // ── Step 7: Remove dm-snapshot and save CoW ────────────────────── if sb.dmDevice != nil { if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil { - // Hard error: if the dm device isn't removed, the CoW file is still - // in use and we can't safely move it. The VM is already destroyed so - // the sandbox is unrecoverable — clean up remaining resources. - // Note: we intentionally skip m.loops.Release here because the stale - // dm device still references the origin loop device. Detaching it now - // would corrupt the dm device. CleanupStaleDevices handles this on - // next agent startup. 
warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) m.slots.Release(sb.SlotIndex) - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) m.mu.Lock() delete(m.boxes, sandboxID) @@ -634,40 +536,32 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { return fmt.Errorf("remove dm-snapshot: %w", err) } - // Move (not copy) the CoW file into the snapshot directory. snapshotCow := snapshot.CowPath(pauseDir, "") if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil { warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - // VM and dm-snapshot are already gone — clean up remaining resources. warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) m.slots.Release(sb.SlotIndex) if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() return fmt.Errorf("move cow file: %w", err) } - // Record which base template this CoW was built against. if err := snapshot.WriteMeta(pauseDir, "", &snapshot.RootfsMeta{ BaseTemplate: sb.baseImagePath, - TemplateID: uuid.UUID(sb.TemplateID).String(), + TemplateID: id.UUIDString(pgtype.UUID{Bytes: sb.TemplateID, Valid: true}), + VCPUs: sb.VCPUs, + MemoryMB: sb.MemoryMB, }); err != nil { warnErr("snapshot dir cleanup error", sandboxID, os.RemoveAll(pauseDir)) - // VM and dm-snapshot are already gone — clean up remaining resources. warnErr("network cleanup error during pause", sandboxID, network.RemoveNetwork(sb.slot)) m.slots.Release(sb.SlotIndex) if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() @@ -675,7 +569,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { } } - // Step 6: Clean up remaining resources (network, loop device, uffd socket). + // ── Step 8: Clean up remaining resources ───────────────────────── if err := network.RemoveNetwork(sb.slot); err != nil { slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err) } @@ -683,54 +577,29 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { if sb.baseImagePath != "" { m.loops.Release(sb.baseImagePath) } - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() - slog.Info("sandbox paused", "id", sandboxID, "snapshot_type", snapshotType, "total_elapsed", time.Since(pauseStart)) + slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart)) return nil } -// Resume restores a paused sandbox from its snapshot using UFFD for -// lazy memory loading. The sandbox gets a new network slot. -// Optional defaultUser and defaultEnv are applied via a single PostInit -// call so that template defaults are set without an extra round-trip. +// Resume restores a paused sandbox from its CH snapshot. +// CH handles memory restore internally (with on-demand paging). +// The sandbox gets a new network slot. +// Optional defaultUser and defaultEnv are applied via PostInit with +// sandbox_id and template_id so envd picks up the new sandbox's metadata. 
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) { pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) if _, err := os.Stat(pauseDir); err != nil { return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID) } - // Read the header to set up the UFFD memory source. - headerData, err := os.ReadFile(filepath.Join(pauseDir, snapshot.MemHeaderName)) - if err != nil { - return nil, fmt.Errorf("read header: %w", err) - } - - header, err := snapshot.Deserialize(headerData) - if err != nil { - return nil, fmt.Errorf("deserialize header: %w", err) - } - - // Build diff file map — supports both single-generation and multi-generation. - diffPaths, err := snapshot.ListDiffFiles(pauseDir, "", header) - if err != nil { - return nil, fmt.Errorf("list diff files: %w", err) - } - - source, err := uffd.NewDiffFileSource(header, diffPaths) - if err != nil { - return nil, fmt.Errorf("create memory source: %w", err) - } - // Read rootfs metadata to find the base template image. meta, err := snapshot.ReadMeta(pauseDir, "") if err != nil { - source.Close() return nil, fmt.Errorf("read rootfs meta: %w", err) } @@ -738,13 +607,11 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, baseImagePath := meta.BaseTemplate originLoop, err := m.loops.Acquire(baseImagePath) if err != nil { - source.Close() return nil, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { - source.Close() m.loops.Release(baseImagePath) return nil, fmt.Errorf("get origin size: %w", err) } @@ -753,13 +620,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, savedCow := snapshot.CowPath(pauseDir, "") cowPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s.cow", sandboxID)) if err := os.Rename(savedCow, cowPath); err != nil { - source.Close() m.loops.Release(baseImagePath) return nil, fmt.Errorf("move cow file: %w", err) } - // rollbackCow attempts to move the CoW file back to the snapshot dir. - // Best-effort — logs a warning if it fails. rollbackCow := func() { if err := os.Rename(cowPath, savedCow); err != nil { slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err) @@ -770,7 +634,6 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, dmName := "wrenn-" + sandboxID dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize) if err != nil { - source.Close() m.loops.Release(baseImagePath) rollbackCow() return nil, fmt.Errorf("restore dm-snapshot: %w", err) @@ -779,7 +642,6 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, // Allocate network slot. 
slotIdx, err := m.slots.Allocate() if err != nil { - source.Close() warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() m.loops.Release(baseImagePath) @@ -788,7 +650,6 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, slot := network.NewSlot(slotIdx) if err := network.CreateNetwork(slot); err != nil { - source.Close() m.slots.Release(slotIdx) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) rollbackCow() @@ -796,41 +657,22 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, return nil, fmt.Errorf("create network: %w", err) } - // Start UFFD server. - uffdSocketPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s-uffd.sock", sandboxID)) - os.Remove(uffdSocketPath) // Clean stale socket. - uffdServer := uffd.NewServer(uffdSocketPath, source) - if err := uffdServer.Start(ctx); err != nil { - source.Close() - warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) - m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) - rollbackCow() - m.loops.Release(baseImagePath) - return nil, fmt.Errorf("start uffd server: %w", err) - } - - // Restore VM from snapshot. + // Restore VM from CH snapshot. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: meta.TemplateID, KernelPath: m.resolveKernelPath(kernelVersion), RootfsPath: dmDev.DevicePath, - VCPUs: 1, // Placeholder; overridden by snapshot. - MemoryMB: int(header.Metadata.Size / (1024 * 1024)), // Placeholder; overridden by snapshot. NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, - FirecrackerBin: m.cfg.FirecrackerBin, + VMMBin: m.cfg.VMMBin, } - resumeSnapPath := filepath.Join(pauseDir, snapshot.SnapFileName) - if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, resumeSnapPath, uffdSocketPath); err != nil { - warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) - source.Close() + if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, pauseDir); err != nil { warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) @@ -839,19 +681,12 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, return nil, fmt.Errorf("restore VM from snapshot: %w", err) } - // Start prefetching all guest memory pages in the background. - // This runs concurrently with envd startup and eliminates on-demand - // page fault latency for subsequent RPC calls. - uffdServer.Prefetch() - // Wait for envd to be ready. 
client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) if err := client.WaitUntilReady(waitCtx); err != nil { waitCancel() - warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) - source.Close() warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) @@ -862,31 +697,70 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, } waitCancel() - // PostInit gets its own timeout — WaitUntilReady may have consumed most - // of EnvdTimeout, starving PostInit of time for RestoreAfterSnapshot. - initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) - defer initCancel() + // PostInit with sandbox_id and template_id so envd sets metadata env vars. + // Fire-and-forget: post-init is non-critical (metadata/env vars), and + // blocking here widens the window for the CP monitor to race against us. + go func() { + initCtx, initCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer initCancel() + if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv, sandboxID, meta.TemplateID); err != nil { + slog.Warn("post-init failed after resume, metadata may be stale", "sandbox", sandboxID, "error", err) + } + }() - if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv); err != nil { - slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err) - } - - // Deflate balloon — the snapshot was taken with an inflated balloon to - // reduce memfile size, so restore the guest's full memory allocation. + // Deflate balloon — the snapshot was taken with an inflated balloon. if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil { slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err) } + // Wait for balloon deflation to settle. With OnDemand UFFD restore, + // the guest is under severe memory pressure while the balloon is still + // inflated. If the sampler or memory reclaimer runs during this window, + // the resulting page fault storm can make the guest kernel unresponsive. + if meta.MemoryMB > 0 { + threshold := int64(float64(meta.MemoryMB) * 0.80 * 1024 * 1024) + settleCtx, settleCancel := context.WithTimeout(ctx, 10*time.Second) + ticker := time.NewTicker(500 * time.Millisecond) + settled := false + for !settled { + select { + case <-settleCtx.Done(): + slog.Warn("resume: balloon deflation did not settle in time", "id", sandboxID) + settled = true + case <-ticker.C: + used, err := readEnvdMemUsed(settleCtx, client) + if err != nil { + continue + } + if used < threshold { + slog.Info("resume: balloon deflation settled", "id", sandboxID, "used_mib", used/(1024*1024)) + settled = true + } + } + } + ticker.Stop() + settleCancel() + } + // Fetch envd version (best-effort). 
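
The settle loop above is a bounded poll; factored out it is just (illustrative helper, not part of this patch):

    // pollUntil ticks every interval until cond reports true or ctx expires.
    // Returns false on timeout.
    func pollUntil(ctx context.Context, interval time.Duration, cond func() bool) bool {
    	t := time.NewTicker(interval)
    	defer t.Stop()
    	for {
    		select {
    		case <-ctx.Done():
    			return false
    		case <-t.C:
    			if cond() {
    				return true
    			}
    		}
    	}
    }

with cond reading readEnvdMemUsed and comparing against the 80% threshold.
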
envdVersion, _ := client.FetchVersion(ctx) + vcpus := meta.VCPUs + if vcpus <= 0 { + vcpus = 1 + } + memoryMB := meta.MemoryMB + if memoryMB <= 0 { + memoryMB = 512 + } + now := time.Now() sb := &sandboxState{ Sandbox: models.Sandbox{ ID: sandboxID, Status: models.StatusRunning, - VCPUs: vmCfg.VCPUs, - MemoryMB: vmCfg.MemoryMB, + VCPUs: vcpus, + MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, @@ -895,17 +769,11 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, - slot: slot, - client: client, - connTracker: &ConnTracker{}, - uffdSocketPath: uffdSocketPath, - dmDevice: dmDev, - baseImagePath: baseImagePath, - // Preserve parent snapshot info so re-pause can use Diff snapshots. - parent: &snapshotParent{ - header: header, - diffPaths: diffPaths, - }, + slot: slot, + client: client, + connTracker: &ConnTracker{}, + dmDevice: dmDev, + baseImagePath: baseImagePath, } m.mu.Lock() @@ -913,16 +781,13 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, m.mu.Unlock() m.startSampler(sb) - - // Don't delete snapshot dir — diff files are needed for re-pause. - // The CoW file was already moved out. The dir will be cleaned up - // on destroy or overwritten on re-pause. + m.startCrashWatcher(sb) slog.Info("sandbox resumed from snapshot", "id", sandboxID, "host_ip", slot.HostIP.String(), "dm_device", dmDev.DevicePath, - "generation", header.Metadata.Generation, + "vcpus", vcpus, ) return &sb.Sandbox, nil @@ -955,42 +820,19 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID string, teamID, return 0, fmt.Errorf("create template dir: %w", err) } - // Copy VM snapshot file and memory header. - srcDir := pauseDir - - for _, fname := range []string{snapshot.SnapFileName, snapshot.MemHeaderName} { - src := filepath.Join(srcDir, fname) + // Copy CH snapshot files (config.json, memory-ranges, state.json). + for _, fname := range []string{snapshot.CHConfigFile, snapshot.CHMemRangesFile, snapshot.CHStateFile} { + src := filepath.Join(pauseDir, fname) dst := filepath.Join(dstDir, fname) + if _, err := os.Stat(src); err != nil { + continue // some files may not exist + } if err := copyFile(src, dst); err != nil { warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) return 0, fmt.Errorf("copy %s: %w", fname, err) } } - // Copy all memory diff files referenced by the header (supports multi-generation). 
- headerData, err := os.ReadFile(filepath.Join(srcDir, snapshot.MemHeaderName)) - if err != nil { - warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) - return 0, fmt.Errorf("read header for template: %w", err) - } - srcHeader, err := snapshot.Deserialize(headerData) - if err != nil { - warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) - return 0, fmt.Errorf("deserialize header for template: %w", err) - } - srcDiffPaths, err := snapshot.ListDiffFiles(pauseDir, "", srcHeader) - if err != nil { - warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) - return 0, fmt.Errorf("list diff files for template: %w", err) - } - for _, srcPath := range srcDiffPaths { - dstPath := filepath.Join(dstDir, filepath.Base(srcPath)) - if err := copyFile(srcPath, dstPath); err != nil { - warnErr("template dir cleanup error", dstDir, os.RemoveAll(dstDir)) - return 0, fmt.Errorf("copy diff file %s: %w", filepath.Base(srcPath), err) - } - } - // Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image. meta, err := snapshot.ReadMeta(pauseDir, "") if err != nil { @@ -1088,10 +930,6 @@ func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, t } m.slots.Release(sb.SlotIndex) - if sb.uffdSocketPath != "" { - os.Remove(sb.uffdSocketPath) - } - // Create template directory and flatten the dm-snapshot. flattenDstDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) if err := os.MkdirAll(flattenDstDir, 0755); err != nil { @@ -1167,49 +1005,22 @@ func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error { return os.RemoveAll(layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)) } -// createFromSnapshot creates a new sandbox by restoring from a snapshot template -// in ImagesDir/{snapshotName}/. Uses UFFD for lazy memory loading. +// createFromSnapshot creates a new sandbox by restoring from a snapshot template. +// CH handles memory restore internally (with on-demand paging). // The template's rootfs.ext4 is a flattened standalone image — we create a // dm-snapshot on top of it just like a normal Create. func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, _, timeoutSec, diskSizeMB int) (*models.Sandbox, error) { tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID) - // Read the header. - headerData, err := os.ReadFile(filepath.Join(tmplDir, snapshot.MemHeaderName)) - if err != nil { - return nil, fmt.Errorf("read snapshot header: %w", err) - } - - header, err := snapshot.Deserialize(headerData) - if err != nil { - return nil, fmt.Errorf("deserialize header: %w", err) - } - - // Snapshot determines memory size. - memoryMB := int(header.Metadata.Size / (1024 * 1024)) - - // Build diff file map — supports multi-generation templates. - diffPaths, err := snapshot.ListDiffFiles(tmplDir, "", header) - if err != nil { - return nil, fmt.Errorf("list diff files: %w", err) - } - - source, err := uffd.NewDiffFileSource(header, diffPaths) - if err != nil { - return nil, fmt.Errorf("create memory source: %w", err) - } - // Set up dm-snapshot on the template's flattened rootfs. 
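
copyFile is used throughout CreateSnapshot above but its definition is not part of this diff; a plausible implementation (the repo's actual helper may differ, e.g. by preserving file modes or copying sparsely):

    func copyFile(src, dst string) error {
    	in, err := os.Open(src)
    	if err != nil {
    		return err
    	}
    	defer in.Close()
    	out, err := os.Create(dst)
    	if err != nil {
    		return err
    	}
    	defer out.Close()
    	if _, err := io.Copy(out, in); err != nil {
    		return err
    	}
    	return out.Sync()
    }
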
baseRootfs := filepath.Join(tmplDir, snapshot.RootfsFileName) originLoop, err := m.loops.Acquire(baseRootfs) if err != nil { - source.Close() return nil, fmt.Errorf("acquire loop device: %w", err) } originSize, err := devicemapper.OriginSizeBytes(originLoop) if err != nil { - source.Close() m.loops.Release(baseRootfs) return nil, fmt.Errorf("get origin size: %w", err) } @@ -1219,7 +1030,6 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team cowSize := max(int64(diskSizeMB)*1024*1024, originSize) dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize) if err != nil { - source.Close() m.loops.Release(baseRootfs) return nil, fmt.Errorf("create dm-snapshot: %w", err) } @@ -1227,7 +1037,6 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team // Allocate network. slotIdx, err := m.slots.Allocate() if err != nil { - source.Close() warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) m.loops.Release(baseRootfs) @@ -1236,7 +1045,6 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team slot := network.NewSlot(slotIdx) if err := network.CreateNetwork(slot); err != nil { - source.Close() m.slots.Release(slotIdx) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) os.Remove(cowPath) @@ -1244,41 +1052,23 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team return nil, fmt.Errorf("create network: %w", err) } - // Start UFFD server. - uffdSocketPath := filepath.Join(layout.SandboxesDir(m.cfg.WrennDir), fmt.Sprintf("%s-uffd.sock", sandboxID)) - os.Remove(uffdSocketPath) - uffdServer := uffd.NewServer(uffdSocketPath, source) - if err := uffdServer.Start(ctx); err != nil { - source.Close() - warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) - m.slots.Release(slotIdx) - warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) - os.Remove(cowPath) - m.loops.Release(baseRootfs) - return nil, fmt.Errorf("start uffd server: %w", err) - } - - // Restore VM. + // Restore VM from CH snapshot. vmCfg := vm.VMConfig{ SandboxID: sandboxID, TemplateID: id.UUIDString(templateID), KernelPath: m.cfg.KernelPath, RootfsPath: dmDev.DevicePath, VCPUs: vcpus, - MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, TapDevice: slot.TapName, TapMAC: slot.TapMAC, GuestIP: slot.GuestIP, GatewayIP: slot.TapIP, NetMask: slot.GuestNetMask, - FirecrackerBin: m.cfg.FirecrackerBin, + VMMBin: m.cfg.VMMBin, } - snapPath := filepath.Join(tmplDir, snapshot.SnapFileName) - if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil { - warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) - source.Close() + if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, tmplDir); err != nil { warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev)) @@ -1287,17 +1077,12 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team return nil, fmt.Errorf("restore VM from snapshot: %w", err) } - // Start prefetching all guest memory pages in the background. - uffdServer.Prefetch() - // Wait for envd. 
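
Both restore paths finish by arming startCrashWatcher (further below), which selects on v.Exited(). A common way to surface process exit as a channel, sketched here since internal/vm/process.go is not shown in this excerpt:

    type watchedProc struct {
    	exited chan struct{}
    }

    func startWatched(cmd *exec.Cmd) (*watchedProc, error) {
    	if err := cmd.Start(); err != nil {
    		return nil, err
    	}
    	p := &watchedProc{exited: make(chan struct{})}
    	go func() {
    		_ = cmd.Wait()  // reap the child
    		close(p.exited) // wake every watcher
    	}()
    	return p, nil
    }

    func (p *watchedProc) Exited() <-chan struct{} { return p.exited }
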
client := envdclient.New(slot.HostIP.String()) waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) if err := client.WaitUntilReady(waitCtx); err != nil { waitCancel() - warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) - source.Close() warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) @@ -1308,13 +1093,12 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team } waitCancel() - // PostInit gets its own timeout — WaitUntilReady may have consumed most - // of EnvdTimeout, starving PostInit of time for RestoreAfterSnapshot. + // PostInit with sandbox_id and template_id so envd sets metadata. initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout) defer initCancel() - if err := client.PostInit(initCtx); err != nil { - slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err) + if err := client.PostInitWithDefaults(initCtx, "", nil, sandboxID, id.UUIDString(templateID)); err != nil { + slog.Warn("post-init failed after template restore, metadata may be stale", "sandbox", sandboxID, "error", err) } // Deflate balloon — template snapshot was taken with an inflated balloon. @@ -1333,7 +1117,6 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team TemplateTeamID: teamID.Bytes, TemplateID: templateID.Bytes, VCPUs: vcpus, - MemoryMB: memoryMB, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, @@ -1342,17 +1125,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team LastActiveAt: now, Metadata: m.buildMetadata(envdVersion), }, - slot: slot, - client: client, - connTracker: &ConnTracker{}, - uffdSocketPath: uffdSocketPath, - dmDevice: dmDev, - baseImagePath: baseRootfs, - // Template-spawned sandboxes also get diff re-pause support. - parent: &snapshotParent{ - header: header, - diffPaths: diffPaths, - }, + slot: slot, + client: client, + connTracker: &ConnTracker{}, + dmDevice: dmDev, + baseImagePath: baseRootfs, } m.mu.Lock() @@ -1360,6 +1137,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team m.mu.Unlock() m.startSampler(sb) + m.startCrashWatcher(sb) slog.Info("sandbox created from snapshot", "id", sandboxID, @@ -1455,7 +1233,7 @@ func (m *Manager) SetDefaults(ctx context.Context, sandboxID, defaultUser string if sb.Status != models.StatusRunning { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } - return sb.client.PostInitWithDefaults(ctx, defaultUser, defaultEnv) + return sb.client.PostInitWithDefaults(ctx, defaultUser, defaultEnv, "", "") } // PtyAttach starts a new PTY process or reconnects to an existing one. @@ -1696,6 +1474,11 @@ func (m *Manager) reapExpired(_ context.Context) { slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err) if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil { slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr) + } else if m.eventSender != nil { + m.eventSender.SendAsync(LifecycleEvent{ + Event: "sandbox.stopped", + SandboxID: id, + }) } continue } @@ -1755,23 +1538,6 @@ func (m *Manager) PauseAll(ctx context.Context) { } } -// removeStaleMemDiffs removes memfile.{uuid} diff files from a snapshot -// directory. 
Called before writing a Full snapshot to prevent orphaned diffs
-// from accumulating across generation resets.
-func removeStaleMemDiffs(dir string) {
-	entries, err := os.ReadDir(dir)
-	if err != nil {
-		return
-	}
-	for _, e := range entries {
-		name := e.Name()
-		// Match "memfile.{uuid}" but not "memfile", "memfile.header", or "memfile.raw".
-		if strings.HasPrefix(name, "memfile.") && name != snapshot.MemHeaderName && name != "memfile.raw" {
-			os.Remove(filepath.Join(dir, name))
-		}
-	}
-}
-
 // warnErr logs a warning if err is non-nil. Used for best-effort cleanup
 // in error paths where the primary error has already been captured.
 func warnErr(msg string, id string, err error) {
@@ -1780,8 +1546,84 @@ func warnErr(msg string, id string, err error) {
 	}
 }
 
-// startSampler resolves the Firecracker PID and starts a background goroutine
-// that samples CPU/mem/disk at 500ms intervals into the ring buffer.
+// startCrashWatcher monitors the VM process for unexpected exits.
+// If the process exits while the sandbox is still in m.boxes (i.e. not a
+// deliberate Destroy), the sandbox is cleaned up and a sandbox.error event
+// is pushed to the control plane.
+func (m *Manager) startCrashWatcher(sb *sandboxState) {
+	v, ok := m.vm.Get(sb.ID)
+	if !ok {
+		return
+	}
+	go func() {
+		select {
+		case <-v.Exited():
+		case <-m.stopCh:
+			return
+		}
+
+		// Check if this was a deliberate Destroy/Pause (sandbox already removed
+		// from boxes, or Pause owns the cleanup).
+		m.mu.Lock()
+		_, stillAlive := m.boxes[sb.ID]
+		if stillAlive && sb.Status == models.StatusPausing {
+			stillAlive = false
+		}
+		if stillAlive {
+			delete(m.boxes, sb.ID)
+		}
+		m.mu.Unlock()
+
+		if !stillAlive {
+			return
+		}
+
+		slog.Error("VM process crashed, cleaning up", "id", sb.ID)
+
+		sb.lifecycleMu.Lock()
+		m.cleanupAfterCrash(sb)
+		sb.lifecycleMu.Unlock()
+
+		if m.onDestroy != nil {
+			m.onDestroy(sb.ID)
+		}
+
+		if m.eventSender != nil {
+			m.eventSender.SendAsync(LifecycleEvent{
+				Event:     "sandbox.error",
+				SandboxID: sb.ID,
+			})
+		}
+	}()
+}
+
+// cleanupAfterCrash tears down sandbox resources after a VM crash.
+// The VM process is already dead, so the vm.Destroy call below only removes
+// the entry from the vm.Manager's map; the real work here is cleaning up
+// network, device-mapper, and loop devices.
+func (m *Manager) cleanupAfterCrash(sb *sandboxState) {
+	m.stopSampler(sb)
+
+	// Remove the VM from the vm.Manager's map (process is already dead).
+	_ = m.vm.Destroy(context.Background(), sb.ID)
+
+	if err := network.RemoveNetwork(sb.slot); err != nil {
+		slog.Warn("crash cleanup: network error", "id", sb.ID, "error", err)
+	}
+	m.slots.Release(sb.SlotIndex)
+
+	if sb.dmDevice != nil {
+		if err := devicemapper.RemoveSnapshot(context.Background(), sb.dmDevice); err != nil {
+			slog.Warn("crash cleanup: dm-snapshot error", "id", sb.ID, "error", err)
+		}
+		os.Remove(sb.dmDevice.CowPath)
+	}
+	if sb.baseImagePath != "" {
+		m.loops.Release(sb.baseImagePath)
+	}
+}
+
+// startSampler resolves the VMM PID and starts a background goroutine
+// that samples CPU/mem/disk at 1s intervals into the ring buffer.
 // Must be called after the sandbox is registered in m.boxes.
 func (m *Manager) startSampler(sb *sandboxState) {
 	v, ok := m.vm.Get(sb.ID)
@@ -1791,13 +1633,11 @@ func (m *Manager) startSampler(sb *sandboxState) {
 	}
 
 	// v.PID() is the cmd.Process.Pid of the "unshare -m -- bash -c script"
-	// invocation.
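
startCrashWatcher leans on a single primitive from the vm package: a channel that is closed when the VMM process dies. A plausible shape for that primitive — a sketch only, since the actual VM type lives in the renamed process.go and may differ (assumes import os/exec):

    // exitNotifier wraps an exec.Cmd and closes a channel once Wait returns.
    // Hypothetical helper; the names are not from this patch.
    type exitNotifier struct {
        cmd    *exec.Cmd
        exited chan struct{}
    }

    func watchExit(cmd *exec.Cmd) *exitNotifier {
        n := &exitNotifier{cmd: cmd, exited: make(chan struct{})}
        go func() {
            _ = cmd.Wait() // reaps the child no matter how it died
            close(n.exited)
        }()
        return n
    }

    // Exited returns a channel that is closed when the process terminates.
    func (n *exitNotifier) Exited() <-chan struct{} { return n.exited }
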
Because unshare(2) modifies the current process's namespace - // before exec-replacing itself with bash, and bash exec-replaces itself - // with ip-netns-exec, which exec-replaces itself with firecracker, the - // entire exec chain occupies the same PID. v.PID() IS the Firecracker PID. - fcPID := v.PID() + // invocation. The exec chain (unshare → bash → ip netns exec → cloud-hypervisor) + // occupies the same PID. v.PID() IS the VMM PID. + vmmPID := v.PID() - sb.fcPID = fcPID + sb.vmmPID = vmmPID sb.ring = newMetricsRing() ctx, cancel := context.WithCancel(context.Background()) @@ -1806,17 +1646,17 @@ func (m *Manager) startSampler(sb *sandboxState) { // Read initial CPU counters for delta calculation. // Passed to goroutine as local state — no shared mutation. - initialCPU, err := readCPUStat(fcPID) + initialCPU, err := readCPUStat(vmmPID) if err != nil { slog.Warn("metrics: could not read initial CPU stat", "id", sb.ID, "error", err) } - go m.samplerLoop(ctx, sb, fcPID, sb.VCPUs, initialCPU) + go m.samplerLoop(ctx, sb, vmmPID, sb.VCPUs, initialCPU) } // samplerLoop samples metrics at 1s intervals. // lastCPU is goroutine-local to avoid shared-state races. -func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpus int, lastCPU cpuStat) { +func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, vmmPID, vcpus int, lastCPU cpuStat) { defer close(sb.samplerDone) ticker := time.NewTicker(1 * time.Second) @@ -1836,7 +1676,7 @@ func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpu // CPU: delta jiffies / (elapsed * CLK_TCK * vcpus) * 100 var cpuPct float64 - cur, err := readCPUStat(fcPID) + cur, err := readCPUStat(vmmPID) if err == nil { if cpuInitialized && elapsed > 0 && vcpus > 0 { deltaJiffies := float64((cur.utime + cur.stime) - (lastCPU.utime + lastCPU.stime)) @@ -1853,10 +1693,10 @@ func (m *Manager) samplerLoop(ctx context.Context, sb *sandboxState, fcPID, vcpu } // Memory: guest-reported used memory from envd /metrics. - // VmRSS of the Firecracker process includes guest page cache - // and never decreases, so we use the guest's own view which - // reports total - available (actual process memory). - memBytes, _ := readEnvdMemUsed(sb.client) + // VmRSS of the VMM process includes guest page cache and never + // decreases, so we use the guest's own view which reports + // total - available (actual process memory). + memBytes, _ := readEnvdMemUsed(ctx, sb.client) // Disk: allocated bytes of the CoW sparse file. var diskBytes int64 diff --git a/internal/sandbox/proc.go b/internal/sandbox/proc.go index ede22f0..276a13f 100644 --- a/internal/sandbox/proc.go +++ b/internal/sandbox/proc.go @@ -1,9 +1,11 @@ package sandbox import ( + "context" "encoding/json" "fmt" "io" + "net/http" "os" "strconv" "strings" @@ -50,10 +52,15 @@ func readCPUStat(pid int) (cpuStat, error) { // readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns // guest-side total - MemAvailable (actual process memory, excluding reclaimable -// page cache). VmRSS of the Firecracker process includes guest page cache and +// page cache). VmRSS of the VMM process includes guest page cache and // never decreases, so this is the accurate metric for dashboard display. 
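
To make the sampler's CPU formula concrete: with CLK_TCK = 100 (the usual Linux value), a 2-vCPU sandbox that accumulated 150 jiffies of utime+stime over a 1-second tick reports 150 / (1 × 100 × 2) × 100 = 75%. The same arithmetic as a standalone function:

    // cpuPercent mirrors the sampler's formula: delta jiffies normalized by
    // wall time, clock ticks per second, and vCPU count.
    func cpuPercent(deltaJiffies, elapsedSec float64, clkTck, vcpus int) float64 {
        if elapsedSec <= 0 || vcpus == 0 {
            return 0
        }
        return deltaJiffies / (elapsedSec * float64(clkTck) * float64(vcpus)) * 100
    }
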
-func readEnvdMemUsed(client *envdclient.Client) (int64, error) { - resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics") +func readEnvdMemUsed(ctx context.Context, client *envdclient.Client) (int64, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, client.BaseURL()+"/metrics", nil) + if err != nil { + return 0, fmt.Errorf("build metrics request: %w", err) + } + + resp, err := client.HTTPClient().Do(req) if err != nil { return 0, fmt.Errorf("fetch envd metrics: %w", err) } diff --git a/internal/snapshot/header.go b/internal/snapshot/header.go deleted file mode 100644 index e679529..0000000 --- a/internal/snapshot/header.go +++ /dev/null @@ -1,221 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk - -// Package snapshot implements snapshot storage, header-based memory mapping, -// and memory file processing for Firecracker VM snapshots. -// -// The header system implements a generational copy-on-write memory mapping. -// Each snapshot generation stores only the blocks that changed since the -// previous generation. A Header contains a sorted list of BuildMap entries -// that together cover the entire memory address space, with each entry -// pointing to a specific generation's diff file. -package snapshot - -import ( - "bytes" - "context" - "encoding/binary" - "errors" - "fmt" - "io" - - "github.com/google/uuid" -) - -const metadataVersion = 1 - -// Metadata is the fixed-size header prefix describing the snapshot memory layout. -// Binary layout (little-endian, 64 bytes total): -// -// Version uint64 (8 bytes) -// BlockSize uint64 (8 bytes) -// Size uint64 (8 bytes) — total memory size in bytes -// Generation uint64 (8 bytes) -// BuildID [16]byte (UUID) -// BaseBuildID [16]byte (UUID) -type Metadata struct { - Version uint64 - BlockSize uint64 - Size uint64 - Generation uint64 - BuildID uuid.UUID - BaseBuildID uuid.UUID -} - -// NewMetadata creates metadata for a first-generation snapshot. -func NewMetadata(buildID uuid.UUID, blockSize, size uint64) *Metadata { - return &Metadata{ - Version: metadataVersion, - Generation: 0, - BlockSize: blockSize, - Size: size, - BuildID: buildID, - BaseBuildID: buildID, - } -} - -// NextGeneration creates metadata for the next generation in the chain. -func (m *Metadata) NextGeneration(buildID uuid.UUID) *Metadata { - return &Metadata{ - Version: m.Version, - Generation: m.Generation + 1, - BlockSize: m.BlockSize, - Size: m.Size, - BuildID: buildID, - BaseBuildID: m.BaseBuildID, - } -} - -// BuildMap maps a contiguous range of the memory address space to a specific -// generation's diff file. Binary layout (little-endian, 40 bytes): -// -// Offset uint64 — byte offset in the virtual address space -// Length uint64 — byte count (multiple of BlockSize) -// BuildID [16]byte — which generation's diff file, uuid.Nil = zero-fill -// BuildStorageOffset uint64 — byte offset within that generation's diff file -type BuildMap struct { - Offset uint64 - Length uint64 - BuildID uuid.UUID - BuildStorageOffset uint64 -} - -// Header is the in-memory representation of a snapshot's memory mapping. -// It provides O(log N) lookup from any memory offset to the correct -// generation's diff file and offset within it. -type Header struct { - Metadata *Metadata - Mapping []*BuildMap - - // blockStarts tracks which block indices start a new BuildMap entry. - // startMap provides direct access from block index to the BuildMap. 
- blockStarts []bool - startMap map[int64]*BuildMap -} - -// NewHeader creates a Header from metadata and mapping entries. -// If mapping is nil/empty, a single entry covering the full size is created. -func NewHeader(metadata *Metadata, mapping []*BuildMap) (*Header, error) { - if metadata.BlockSize == 0 { - return nil, fmt.Errorf("block size cannot be zero") - } - - if len(mapping) == 0 { - mapping = []*BuildMap{{ - Offset: 0, - Length: metadata.Size, - BuildID: metadata.BuildID, - BuildStorageOffset: 0, - }} - } - - blocks := TotalBlocks(int64(metadata.Size), int64(metadata.BlockSize)) - starts := make([]bool, blocks) - startMap := make(map[int64]*BuildMap, len(mapping)) - - for _, m := range mapping { - idx := BlockIdx(int64(m.Offset), int64(metadata.BlockSize)) - if idx >= 0 && idx < blocks { - starts[idx] = true - startMap[idx] = m - } - } - - return &Header{ - Metadata: metadata, - Mapping: mapping, - blockStarts: starts, - startMap: startMap, - }, nil -} - -// GetShiftedMapping resolves a memory offset to the corresponding diff file -// offset, remaining length, and build ID. This is the hot path called for -// every UFFD page fault. -func (h *Header) GetShiftedMapping(_ context.Context, offset int64) (mappedOffset int64, mappedLength int64, buildID *uuid.UUID, err error) { - if offset < 0 || offset >= int64(h.Metadata.Size) { - return 0, 0, nil, fmt.Errorf("offset %d out of bounds (size: %d)", offset, h.Metadata.Size) - } - - blockSize := int64(h.Metadata.BlockSize) - block := BlockIdx(offset, blockSize) - - // Walk backwards to find the BuildMap that contains this block. - start := block - for start >= 0 { - if h.blockStarts[start] { - break - } - start-- - } - if start < 0 { - return 0, 0, nil, fmt.Errorf("no mapping found for offset %d", offset) - } - - m, ok := h.startMap[start] - if !ok { - return 0, 0, nil, fmt.Errorf("no mapping at block %d", start) - } - - shift := (block - start) * blockSize - if shift >= int64(m.Length) { - return 0, 0, nil, fmt.Errorf("offset %d beyond mapping end (mapping offset=%d, length=%d)", offset, m.Offset, m.Length) - } - - return int64(m.BuildStorageOffset) + shift, int64(m.Length) - shift, &m.BuildID, nil -} - -// Serialize writes metadata + mapping entries to binary (little-endian). -func Serialize(metadata *Metadata, mappings []*BuildMap) ([]byte, error) { - var buf bytes.Buffer - - if err := binary.Write(&buf, binary.LittleEndian, metadata); err != nil { - return nil, fmt.Errorf("write metadata: %w", err) - } - - for _, m := range mappings { - if err := binary.Write(&buf, binary.LittleEndian, m); err != nil { - return nil, fmt.Errorf("write mapping: %w", err) - } - } - - return buf.Bytes(), nil -} - -// Deserialize reads a header from binary data. -func Deserialize(data []byte) (*Header, error) { - reader := bytes.NewReader(data) - - var metadata Metadata - if err := binary.Read(reader, binary.LittleEndian, &metadata); err != nil { - return nil, fmt.Errorf("read metadata: %w", err) - } - - var mappings []*BuildMap - for { - var m BuildMap - if err := binary.Read(reader, binary.LittleEndian, &m); err != nil { - if errors.Is(err, io.EOF) { - break - } - return nil, fmt.Errorf("read mapping: %w", err) - } - mappings = append(mappings, &m) - } - - return NewHeader(&metadata, mappings) -} - -// Block index helpers. 
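
A worked example of the lookup being deleted here: with BlockSize 4096 and a single BuildMap{Offset: 8192, Length: 16384, BuildStorageOffset: 0}, a fault at offset 12288 is block 3; the backwards walk finds the entry starting at block 2, so the shift is (3 − 2) × 4096 and the page is served from diff-file offset 4096, with 16384 − 4096 = 12288 bytes remaining in the mapping. Note that the package comment's "O(log N) lookup" oversells the code: the walk over blockStarts is linear in the worst case — one more reason the CH-native restore is a simplification. The Metadata prefix, meanwhile, is exactly 64 bytes (four uint64 fields plus two 16-byte UUIDs), which is why binary.Write/Read round-trip it with no padding surprises.
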
- -func TotalBlocks(size, blockSize int64) int64 { - return (size + blockSize - 1) / blockSize -} - -func BlockIdx(offset, blockSize int64) int64 { - return offset / blockSize -} - -func BlockOffset(idx, blockSize int64) int64 { - return idx * blockSize -} diff --git a/internal/snapshot/local.go b/internal/snapshot/local.go index 95b9574..3ff30b8 100644 --- a/internal/snapshot/local.go +++ b/internal/snapshot/local.go @@ -7,14 +7,15 @@ import ( "os" "path/filepath" "syscall" - - "github.com/google/uuid" ) const ( - SnapFileName = "snapfile" - MemDiffName = "memfile" - MemHeaderName = "memfile.header" + // Cloud Hypervisor snapshot files. + CHConfigFile = "config.json" + CHMemRangesFile = "memory-ranges" + CHStateFile = "state.json" + + // Rootfs files. RootfsFileName = "rootfs.ext4" RootfsCowName = "rootfs.cow" RootfsMetaName = "rootfs.meta" @@ -25,27 +26,6 @@ func DirPath(baseDir, name string) string { return filepath.Join(baseDir, name) } -// SnapPath returns the path to the VM state snapshot file. -func SnapPath(baseDir, name string) string { - return filepath.Join(DirPath(baseDir, name), SnapFileName) -} - -// MemDiffPath returns the path to the compact memory diff file (legacy single-generation). -func MemDiffPath(baseDir, name string) string { - return filepath.Join(DirPath(baseDir, name), MemDiffName) -} - -// MemDiffPathForBuild returns the path to a specific generation's diff file. -// Format: memfile.{buildID} -func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string { - return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String())) -} - -// MemHeaderPath returns the path to the memory mapping header file. -func MemHeaderPath(baseDir, name string) string { - return filepath.Join(DirPath(baseDir, name), MemHeaderName) -} - // RootfsPath returns the path to the rootfs image. func RootfsPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), RootfsFileName) @@ -61,10 +41,13 @@ func MetaPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), RootfsMetaName) } -// RootfsMeta records which base template a CoW file was created against. +// RootfsMeta records which base template a CoW file was created against +// and the VM resource config needed to restart the sampler on resume. type RootfsMeta struct { BaseTemplate string `json:"base_template"` TemplateID string `json:"template_id,omitempty"` + VCPUs int `json:"vcpus,omitempty"` + MemoryMB int `json:"memory_mb,omitempty"` } // WriteMeta writes rootfs metadata to the snapshot directory. @@ -92,102 +75,6 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) { return &meta, nil } -// Exists reports whether a complete snapshot exists (all required files present). -// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots. -// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}". -func Exists(baseDir, name string) bool { - dir := DirPath(baseDir, name) - - // snapfile and header are always required. - for _, f := range []string{SnapFileName, MemHeaderName} { - if _, err := os.Stat(filepath.Join(dir, f)); err != nil { - return false - } - } - - // Check that at least one memfile exists (legacy or generation-specific). - // We verify by reading the header and checking that referenced diff files exist. - // Fall back to checking for the legacy memfile name if header can't be read. 
- if _, err := os.Stat(filepath.Join(dir, MemDiffName)); err != nil { - // No legacy memfile — check if any memfile.{uuid} exists by - // looking for files matching the pattern. - matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*")) - hasGenDiff := false - for _, m := range matches { - base := filepath.Base(m) - if base != MemHeaderName { - hasGenDiff = true - break - } - } - if !hasGenDiff { - return false - } - } - - // Accept either rootfs.ext4 (legacy/template) or rootfs.cow + rootfs.meta (dm-snapshot). - if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil { - return true - } - if _, err := os.Stat(filepath.Join(dir, RootfsCowName)); err == nil { - if _, err := os.Stat(filepath.Join(dir, RootfsMetaName)); err == nil { - return true - } - } - return false -} - -// IsTemplate reports whether a template image directory exists (has rootfs.ext4). -func IsTemplate(baseDir, name string) bool { - _, err := os.Stat(filepath.Join(DirPath(baseDir, name), RootfsFileName)) - return err == nil -} - -// IsSnapshot reports whether a directory is a snapshot (has all snapshot files). -func IsSnapshot(baseDir, name string) bool { - return Exists(baseDir, name) -} - -// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta) -// as opposed to legacy full rootfs (rootfs.ext4). -func HasCow(baseDir, name string) bool { - dir := DirPath(baseDir, name) - _, cowErr := os.Stat(filepath.Join(dir, RootfsCowName)) - _, metaErr := os.Stat(filepath.Join(dir, RootfsMetaName)) - return cowErr == nil && metaErr == nil -} - -// ListDiffFiles returns a map of build ID → file path for all memory diff files -// referenced by the given header. Handles both the legacy "memfile" name -// (single-generation) and generation-specific "memfile.{uuid}" names. -func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) { - dir := DirPath(baseDir, name) - result := make(map[string]string) - - for _, m := range header.Mapping { - if m.BuildID == uuid.Nil { - continue // zero-fill, no file needed - } - idStr := m.BuildID.String() - if _, exists := result[idStr]; exists { - continue - } - // Try generation-specific path first, fall back to legacy. - genPath := filepath.Join(dir, fmt.Sprintf("memfile.%s", idStr)) - if _, err := os.Stat(genPath); err == nil { - result[idStr] = genPath - continue - } - legacyPath := filepath.Join(dir, MemDiffName) - if _, err := os.Stat(legacyPath); err == nil { - result[idStr] = legacyPath - continue - } - return nil, fmt.Errorf("diff file not found for build %s", idStr) - } - return result, nil -} - // EnsureDir creates the snapshot directory if it doesn't exist. func EnsureDir(baseDir, name string) error { dir := DirPath(baseDir, name) diff --git a/internal/snapshot/mapping.go b/internal/snapshot/mapping.go deleted file mode 100644 index 8451518..0000000 --- a/internal/snapshot/mapping.go +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk - -package snapshot - -import "github.com/google/uuid" - -// CreateMapping converts a dirty-block bitset (represented as a []bool) into -// a sorted list of BuildMap entries. Consecutive dirty blocks are merged into -// a single entry. BuildStorageOffset tracks the sequential position in the -// compact diff file. 
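
For reference, the run-merging this deleted helper performed: a dirty bitset [T, T, F, T] with 4096-byte blocks produces exactly two entries — {Offset: 0, Length: 8192, BuildStorageOffset: 0} and {Offset: 12288, Length: 4096, BuildStorageOffset: 8192} — because BuildStorageOffset advances only by bytes actually written to the compact diff, while Offset tracks the guest address space.
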
-func CreateMapping(buildID uuid.UUID, dirty []bool, blockSize int64) []*BuildMap { - var mappings []*BuildMap - var runStart int64 = -1 - var runLength int64 - var storageOffset uint64 - - for i, set := range dirty { - if !set { - if runLength > 0 { - mappings = append(mappings, &BuildMap{ - Offset: uint64(runStart) * uint64(blockSize), - Length: uint64(runLength) * uint64(blockSize), - BuildID: buildID, - BuildStorageOffset: storageOffset, - }) - storageOffset += uint64(runLength) * uint64(blockSize) - runLength = 0 - } - runStart = -1 - continue - } - - if runStart < 0 { - runStart = int64(i) - runLength = 1 - } else { - runLength++ - } - } - - if runLength > 0 { - mappings = append(mappings, &BuildMap{ - Offset: uint64(runStart) * uint64(blockSize), - Length: uint64(runLength) * uint64(blockSize), - BuildID: buildID, - BuildStorageOffset: storageOffset, - }) - } - - return mappings -} - -// MergeMappings overlays diffMapping on top of baseMapping. Where they overlap, -// diff takes priority. The result covers the entire address space. -// -// Both inputs must be sorted by Offset. The base mapping should cover the full size. -// -// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk). -func MergeMappings(baseMapping, diffMapping []*BuildMap) []*BuildMap { - if len(diffMapping) == 0 { - return baseMapping - } - - // Work on a copy of baseMapping to avoid mutating the original. - baseCopy := make([]*BuildMap, len(baseMapping)) - for i, m := range baseMapping { - cp := *m - baseCopy[i] = &cp - } - - var result []*BuildMap - var bi, di int - - for bi < len(baseCopy) && di < len(diffMapping) { - base := baseCopy[bi] - diff := diffMapping[di] - - if base.Length == 0 { - bi++ - continue - } - if diff.Length == 0 { - di++ - continue - } - - // No overlap: base entirely before diff. - if base.Offset+base.Length <= diff.Offset { - result = append(result, base) - bi++ - continue - } - - // No overlap: diff entirely before base. - if diff.Offset+diff.Length <= base.Offset { - result = append(result, diff) - di++ - continue - } - - // Base fully inside diff — skip base. - if base.Offset >= diff.Offset && base.Offset+base.Length <= diff.Offset+diff.Length { - bi++ - continue - } - - // Diff fully inside base — split base around diff. - if diff.Offset >= base.Offset && diff.Offset+diff.Length <= base.Offset+base.Length { - leftLen := int64(diff.Offset) - int64(base.Offset) - if leftLen > 0 { - result = append(result, &BuildMap{ - Offset: base.Offset, - Length: uint64(leftLen), - BuildID: base.BuildID, - BuildStorageOffset: base.BuildStorageOffset, - }) - } - - result = append(result, diff) - di++ - - rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset) - rightLen := int64(base.Length) - rightShift - - if rightLen > 0 { - baseCopy[bi] = &BuildMap{ - Offset: base.Offset + uint64(rightShift), - Length: uint64(rightLen), - BuildID: base.BuildID, - BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift), - } - } else { - bi++ - } - continue - } - - // Base starts after diff with overlap — emit diff, trim base. 
- if base.Offset > diff.Offset { - result = append(result, diff) - di++ - - rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset) - rightLen := int64(base.Length) - rightShift - - if rightLen > 0 { - baseCopy[bi] = &BuildMap{ - Offset: base.Offset + uint64(rightShift), - Length: uint64(rightLen), - BuildID: base.BuildID, - BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift), - } - } else { - bi++ - } - continue - } - - // Diff starts after base with overlap — emit left part of base. - if diff.Offset > base.Offset { - leftLen := int64(diff.Offset) - int64(base.Offset) - if leftLen > 0 { - result = append(result, &BuildMap{ - Offset: base.Offset, - Length: uint64(leftLen), - BuildID: base.BuildID, - BuildStorageOffset: base.BuildStorageOffset, - }) - } - bi++ - continue - } - } - - // Append remaining entries. - result = append(result, baseCopy[bi:]...) - result = append(result, diffMapping[di:]...) - - return result -} - -// NormalizeMappings merges adjacent entries with the same BuildID. -func NormalizeMappings(mappings []*BuildMap) []*BuildMap { - if len(mappings) == 0 { - return nil - } - - result := make([]*BuildMap, 0, len(mappings)) - current := &BuildMap{ - Offset: mappings[0].Offset, - Length: mappings[0].Length, - BuildID: mappings[0].BuildID, - BuildStorageOffset: mappings[0].BuildStorageOffset, - } - - for i := 1; i < len(mappings); i++ { - m := mappings[i] - if m.BuildID == current.BuildID { - current.Length += m.Length - } else { - result = append(result, current) - current = &BuildMap{ - Offset: m.Offset, - Length: m.Length, - BuildID: m.BuildID, - BuildStorageOffset: m.BuildStorageOffset, - } - } - } - result = append(result, current) - - return result -} diff --git a/internal/snapshot/memfile.go b/internal/snapshot/memfile.go deleted file mode 100644 index f7b14f9..0000000 --- a/internal/snapshot/memfile.go +++ /dev/null @@ -1,285 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk - -package snapshot - -import ( - "context" - "fmt" - "io" - "os" - - "github.com/google/uuid" -) - -const ( - // DefaultBlockSize is 4KB — standard page size for Firecracker. - DefaultBlockSize int64 = 4096 -) - -// ProcessMemfile reads a full memory file produced by Firecracker's -// PUT /snapshot/create, identifies non-zero blocks, and writes only those -// blocks to a compact diff file. Returns the Header describing the mapping. -// -// The output diff file contains non-zero blocks written sequentially. -// The header maps each block in the full address space to either: -// - A position in the diff file (for non-zero blocks) -// - uuid.Nil (for zero/empty blocks, served as zeros without I/O) -// -// buildID identifies this snapshot generation in the header chain. 
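
The space this pipeline bought, in round (illustrative) numbers: a 512 MiB memfile is 131,072 4-KiB blocks; if only 20% were ever dirtied, ProcessMemfile wrote ~26,214 blocks (≈102 MiB) into the diff and mapped the remaining 80% to uuid.Nil, which the UFFD server then served as zero pages with no disk I/O at all. CH's snapshot sidesteps the whole pipeline by writing its own memory-ranges files.
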
-func ProcessMemfile(memfilePath, diffPath, headerPath string, buildID uuid.UUID) (*Header, error) { - src, err := os.Open(memfilePath) - if err != nil { - return nil, fmt.Errorf("open memfile: %w", err) - } - defer src.Close() - - info, err := src.Stat() - if err != nil { - return nil, fmt.Errorf("stat memfile: %w", err) - } - memSize := info.Size() - - dst, err := os.Create(diffPath) - if err != nil { - return nil, fmt.Errorf("create diff file: %w", err) - } - defer dst.Close() - - totalBlocks := TotalBlocks(memSize, DefaultBlockSize) - dirty := make([]bool, totalBlocks) - empty := make([]bool, totalBlocks) - buf := make([]byte, DefaultBlockSize) - - for i := int64(0); i < totalBlocks; i++ { - n, err := io.ReadFull(src, buf) - if err != nil && err != io.ErrUnexpectedEOF { - return nil, fmt.Errorf("read block %d: %w", i, err) - } - - // Zero-pad the last block if it's short. - if int64(n) < DefaultBlockSize { - for j := n; j < int(DefaultBlockSize); j++ { - buf[j] = 0 - } - } - - if isZeroBlock(buf) { - empty[i] = true - continue - } - - dirty[i] = true - if _, err := dst.Write(buf); err != nil { - return nil, fmt.Errorf("write diff block %d: %w", i, err) - } - } - - // Build header. - dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize) - emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize) - merged := MergeMappings(dirtyMappings, emptyMappings) - normalized := NormalizeMappings(merged) - - metadata := NewMetadata(buildID, uint64(DefaultBlockSize), uint64(memSize)) - header, err := NewHeader(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("create header: %w", err) - } - - // Write header to disk. - headerData, err := Serialize(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("serialize header: %w", err) - } - if err := os.WriteFile(headerPath, headerData, 0644); err != nil { - return nil, fmt.Errorf("write header: %w", err) - } - - return header, nil -} - -// ProcessMemfileWithParent processes a memory file as a new generation on top -// of an existing parent header. The new diff file contains only blocks that -// differ from what the parent header maps. This is used for re-pause of a -// sandbox that was restored from a snapshot. -func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHeader *Header, buildID uuid.UUID) (*Header, error) { - src, err := os.Open(memfilePath) - if err != nil { - return nil, fmt.Errorf("open memfile: %w", err) - } - defer src.Close() - - info, err := src.Stat() - if err != nil { - return nil, fmt.Errorf("stat memfile: %w", err) - } - memSize := info.Size() - - dst, err := os.Create(diffPath) - if err != nil { - return nil, fmt.Errorf("create diff file: %w", err) - } - defer dst.Close() - - totalBlocks := TotalBlocks(memSize, DefaultBlockSize) - dirty := make([]bool, totalBlocks) - buf := make([]byte, DefaultBlockSize) - - for i := int64(0); i < totalBlocks; i++ { - n, err := io.ReadFull(src, buf) - if err != nil && err != io.ErrUnexpectedEOF { - return nil, fmt.Errorf("read block %d: %w", i, err) - } - - if int64(n) < DefaultBlockSize { - for j := n; j < int(DefaultBlockSize); j++ { - buf[j] = 0 - } - } - - if isZeroBlock(buf) { - // For a diff memfile, zero blocks mean "not dirtied since resume" — - // they should inherit the parent's mapping, not be zero-filled. 
- continue - } - - dirty[i] = true - if _, err := dst.Write(buf); err != nil { - return nil, fmt.Errorf("write diff block %d: %w", i, err) - } - } - - // Only dirty blocks go into the diff overlay; MergeMappings preserves the - // parent's mapping for everything else. - dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize) - merged := MergeMappings(parentHeader.Mapping, dirtyMappings) - normalized := NormalizeMappings(merged) - - metadata := parentHeader.Metadata.NextGeneration(buildID) - header, err := NewHeader(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("create header: %w", err) - } - - headerData, err := Serialize(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("serialize header: %w", err) - } - if err := os.WriteFile(headerPath, headerData, 0644); err != nil { - return nil, fmt.Errorf("write header: %w", err) - } - - return header, nil -} - -// MergeDiffs consolidates multiple generation diff files into a single diff -// file and resets the generation counter to 0. This is a pure file-level -// operation — no Firecracker involvement. -// -// It reads each non-nil block from the appropriate diff file (as mapped by -// the header), writes them all sequentially into a single new diff file, -// and produces a fresh header pointing only at that file. -// -// diffFiles maps build ID (string) → open file path for each generation's diff. -func MergeDiffs(header *Header, diffFiles map[string]string, mergedDiffPath, headerPath string) (*Header, error) { - blockSize := int64(header.Metadata.BlockSize) - mergedBuildID := uuid.New() - - // Open all source diff files. - sources := make(map[string]*os.File, len(diffFiles)) - for id, path := range diffFiles { - f, err := os.Open(path) - if err != nil { - // Close already opened files. - for _, sf := range sources { - sf.Close() - } - return nil, fmt.Errorf("open diff file for build %s: %w", id, err) - } - sources[id] = f - } - defer func() { - for _, f := range sources { - f.Close() - } - }() - - dst, err := os.Create(mergedDiffPath) - if err != nil { - return nil, fmt.Errorf("create merged diff file: %w", err) - } - defer dst.Close() - - totalBlocks := TotalBlocks(int64(header.Metadata.Size), blockSize) - dirty := make([]bool, totalBlocks) - empty := make([]bool, totalBlocks) - buf := make([]byte, blockSize) - - for i := int64(0); i < totalBlocks; i++ { - offset := i * blockSize - mappedOffset, _, buildID, err := header.GetShiftedMapping(context.Background(), offset) - if err != nil { - return nil, fmt.Errorf("lookup block %d: %w", i, err) - } - - if *buildID == uuid.Nil { - empty[i] = true - continue - } - - src, ok := sources[buildID.String()] - if !ok { - return nil, fmt.Errorf("no diff file for build %s (block %d)", buildID, i) - } - - if _, err := src.ReadAt(buf, mappedOffset); err != nil { - return nil, fmt.Errorf("read block %d from build %s: %w", i, buildID, err) - } - - dirty[i] = true - if _, err := dst.Write(buf); err != nil { - return nil, fmt.Errorf("write merged block %d: %w", i, err) - } - } - - // Build fresh header with generation 0. 
- dirtyMappings := CreateMapping(mergedBuildID, dirty, blockSize) - emptyMappings := CreateMapping(uuid.Nil, empty, blockSize) - merged := MergeMappings(dirtyMappings, emptyMappings) - normalized := NormalizeMappings(merged) - - metadata := NewMetadata(mergedBuildID, uint64(blockSize), header.Metadata.Size) - newHeader, err := NewHeader(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("create merged header: %w", err) - } - - headerData, err := Serialize(metadata, normalized) - if err != nil { - return nil, fmt.Errorf("serialize merged header: %w", err) - } - if err := os.WriteFile(headerPath, headerData, 0644); err != nil { - return nil, fmt.Errorf("write merged header: %w", err) - } - - return newHeader, nil -} - -// isZeroBlock checks if a block is entirely zero bytes. -func isZeroBlock(block []byte) bool { - // Fast path: compare 8 bytes at a time. - for i := 0; i+8 <= len(block); i += 8 { - if block[i] != 0 || block[i+1] != 0 || block[i+2] != 0 || block[i+3] != 0 || - block[i+4] != 0 || block[i+5] != 0 || block[i+6] != 0 || block[i+7] != 0 { - return false - } - } - // Tail bytes. - for i := len(block) &^ 7; i < len(block); i++ { - if block[i] != 0 { - return false - } - } - return true -} diff --git a/internal/uffd/fd.go b/internal/uffd/fd.go deleted file mode 100644 index 8e0fba2..0000000 --- a/internal/uffd/fd.go +++ /dev/null @@ -1,92 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk - -// Package uffd implements a userfaultfd-based memory server for Firecracker -// snapshot restore. When a VM is restored from a snapshot, instead of loading -// the entire memory file upfront, the UFFD handler intercepts page faults -// and serves memory pages on demand from the snapshot's compact diff file. -package uffd - -/* -#include -#include -#include -#include - -struct uffd_pagefault { - __u64 flags; - __u64 address; - __u32 ptid; -}; -*/ -import "C" - -import ( - "fmt" - "syscall" - "unsafe" -) - -const ( - UFFD_EVENT_PAGEFAULT = C.UFFD_EVENT_PAGEFAULT - UFFD_EVENT_FORK = C.UFFD_EVENT_FORK - UFFD_EVENT_REMAP = C.UFFD_EVENT_REMAP - UFFD_EVENT_REMOVE = C.UFFD_EVENT_REMOVE - UFFD_EVENT_UNMAP = C.UFFD_EVENT_UNMAP - UFFD_PAGEFAULT_FLAG_WRITE = C.UFFD_PAGEFAULT_FLAG_WRITE - UFFDIO_COPY = C.UFFDIO_COPY - UFFDIO_COPY_MODE_WP = C.UFFDIO_COPY_MODE_WP -) - -type ( - uffdMsg = C.struct_uffd_msg - uffdPagefault = C.struct_uffd_pagefault - uffdioCopy = C.struct_uffdio_copy -) - -// fd wraps a userfaultfd file descriptor received from Firecracker. -type fd uintptr - -// copy installs a page into guest memory at the given address using UFFDIO_COPY. -// mode controls write-protection: use UFFDIO_COPY_MODE_WP to preserve WP bit. -func (f fd) copy(addr, pagesize uintptr, data []byte, mode C.ulonglong) error { - alignedAddr := addr &^ (pagesize - 1) - cpy := uffdioCopy{ - src: C.ulonglong(uintptr(unsafe.Pointer(&data[0]))), - dst: C.ulonglong(alignedAddr), - len: C.ulonglong(pagesize), - mode: mode, - copy: 0, - } - - _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy))) - if errno != 0 { - return errno - } - - if cpy.copy != C.longlong(pagesize) { - return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize) - } - - return nil -} - -// close closes the userfaultfd file descriptor. -func (f fd) close() error { - return syscall.Close(int(f)) -} - -// getMsgEvent extracts the event type from a uffd_msg. 
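
One detail worth noting in the deleted copy(): UFFDIO_COPY demands a page-aligned destination, hence addr &^ (pagesize − 1). A fault at 0x7f2a_4567_89ab with 4 KiB pages copies into 0x7f2a_4567_8000 — the start of the faulting page, not the faulting byte.
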
-func getMsgEvent(msg *uffdMsg) C.uchar { - return msg.event -} - -// getMsgArg extracts the arg union from a uffd_msg. -func getMsgArg(msg *uffdMsg) [24]byte { - return msg.arg -} - -// getPagefaultAddress extracts the faulting address from a uffd_pagefault. -func getPagefaultAddress(pf *uffdPagefault) uintptr { - return uintptr(pf.address) -} diff --git a/internal/uffd/region.go b/internal/uffd/region.go deleted file mode 100644 index 20b3921..0000000 --- a/internal/uffd/region.go +++ /dev/null @@ -1,41 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk -// -// Modifications by Omukk (Wrenn Sandbox): merged Region and Mapping into -// single file, inlined shiftedOffset helper. - -package uffd - -import "fmt" - -// Region is a mapping of guest memory to host virtual address space. -// Firecracker sends these as JSON when connecting to the UFFD socket. -// The JSON field names match Firecracker's UFFD protocol. -type Region struct { - BaseHostVirtAddr uintptr `json:"base_host_virt_addr"` - Size uintptr `json:"size"` - Offset uintptr `json:"offset"` - PageSize uintptr `json:"page_size_kib"` // Actually in bytes despite the name. -} - -// Mapping translates between host virtual addresses and logical memory offsets. -type Mapping struct { - Regions []Region -} - -// NewMapping creates a Mapping from a list of regions. -func NewMapping(regions []Region) *Mapping { - return &Mapping{Regions: regions} -} - -// GetOffset converts a host virtual address to a logical memory file offset -// and returns the page size. This is called on every UFFD page fault. -func (m *Mapping) GetOffset(hostVirtAddr uintptr) (int64, uintptr, error) { - for _, r := range m.Regions { - if hostVirtAddr >= r.BaseHostVirtAddr && hostVirtAddr < r.BaseHostVirtAddr+r.Size { - offset := int64(hostVirtAddr-r.BaseHostVirtAddr) + int64(r.Offset) - return offset, r.PageSize, nil - } - } - return 0, 0, fmt.Errorf("address %#x not found in any memory region", hostVirtAddr) -} diff --git a/internal/uffd/server.go b/internal/uffd/server.go deleted file mode 100644 index d7fd8d0..0000000 --- a/internal/uffd/server.go +++ /dev/null @@ -1,451 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// Modifications by M/S Omukk -// -// Modifications by Omukk (Wrenn Sandbox): replaced errgroup with WaitGroup -// + semaphore, replaced fdexit abstraction with pipe, integrated with -// snapshot.Header-based DiffFileSource instead of block.ReadonlyDevice, -// fixed EAGAIN handling in poll loop. - -package uffd - -/* -#include -*/ -import "C" - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "log/slog" - "net" - "os" - "sync" - "syscall" - "unsafe" - - "golang.org/x/sys/unix" - - "git.omukk.dev/wrenn/wrenn/internal/snapshot" -) - -const ( - fdSize = 4 - regionMappingsSize = 1024 - maxConcurrentFaults = 4096 -) - -// MemorySource provides page data for the UFFD handler. -// Given a logical memory offset and a size, it returns the page data. -type MemorySource interface { - ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error) -} - -// Server manages the UFFD Unix socket lifecycle and page fault handling -// for a single Firecracker snapshot restore. -type Server struct { - socketPath string - source MemorySource - lis *net.UnixListener - - readyCh chan struct{} - readyOnce sync.Once - doneCh chan struct{} - doneErr error - - // exitPipe signals the poll loop to stop. 
- exitR *os.File - exitW *os.File - - // Set by handle() after Firecracker connects; read by Prefetch() - // after waiting on readyCh (which establishes happens-before). - uffdFd fd - mapping *Mapping - - // Prefetch lifecycle: cancel stops the goroutine, prefetchDone is - // closed when it exits. Stop() drains prefetchDone before returning - // so the caller can safely close diff file handles. - prefetchCancel context.CancelFunc - prefetchDone chan struct{} -} - -// NewServer creates a UFFD server that will listen on the given socket path -// and serve memory pages from the given source. -func NewServer(socketPath string, source MemorySource) *Server { - return &Server{ - socketPath: socketPath, - source: source, - readyCh: make(chan struct{}), - doneCh: make(chan struct{}), - } -} - -// Start begins listening on the Unix socket. Firecracker will connect to this -// socket after loadSnapshot is called with the UFFD backend. -// Start returns immediately; the server runs in a background goroutine. -func (s *Server) Start(ctx context.Context) error { - lis, err := net.ListenUnix("unix", &net.UnixAddr{Name: s.socketPath, Net: "unix"}) - if err != nil { - return fmt.Errorf("listen on uffd socket: %w", err) - } - s.lis = lis - - if err := os.Chmod(s.socketPath, 0o777); err != nil { - lis.Close() - return fmt.Errorf("chmod uffd socket: %w", err) - } - - // Create exit signal pipe. - r, w, err := os.Pipe() - if err != nil { - lis.Close() - return fmt.Errorf("create exit pipe: %w", err) - } - s.exitR = r - s.exitW = w - - go func() { - defer close(s.doneCh) - s.doneErr = s.handle(ctx) - s.lis.Close() - s.exitR.Close() - s.exitW.Close() - s.readyOnce.Do(func() { close(s.readyCh) }) - }() - - return nil -} - -// Ready returns a channel that is closed when the UFFD handler is ready -// (after Firecracker has connected and sent the uffd fd). -func (s *Server) Ready() <-chan struct{} { - return s.readyCh -} - -// Stop signals the UFFD poll loop to exit and waits for it to finish. -// Also cancels and waits for any running prefetch goroutine. -func (s *Server) Stop() error { - if s.prefetchCancel != nil { - s.prefetchCancel() - } - // Write a byte to the exit pipe to wake the poll loop. - _, _ = s.exitW.Write([]byte{0}) - <-s.doneCh - if s.prefetchDone != nil { - <-s.prefetchDone - } - return s.doneErr -} - -// Wait blocks until the server exits. -func (s *Server) Wait() error { - <-s.doneCh - return s.doneErr -} - -// handle accepts the Firecracker connection, receives the UFFD fd via -// SCM_RIGHTS, and runs the page fault poll loop. -func (s *Server) handle(ctx context.Context) error { - conn, err := s.lis.Accept() - if err != nil { - return fmt.Errorf("accept uffd connection: %w", err) - } - - unixConn := conn.(*net.UnixConn) - defer unixConn.Close() - - // Read the memory region mappings (JSON) and the UFFD fd (SCM_RIGHTS). 
- regionBuf := make([]byte, regionMappingsSize) - uffdBuf := make([]byte, syscall.CmsgSpace(fdSize)) - - nRegion, nFd, _, _, err := unixConn.ReadMsgUnix(regionBuf, uffdBuf) - if err != nil { - return fmt.Errorf("read uffd message: %w", err) - } - - var regions []Region - if err := json.Unmarshal(regionBuf[:nRegion], ®ions); err != nil { - return fmt.Errorf("parse memory regions: %w", err) - } - - controlMsgs, err := syscall.ParseSocketControlMessage(uffdBuf[:nFd]) - if err != nil { - return fmt.Errorf("parse control messages: %w", err) - } - if len(controlMsgs) != 1 { - return fmt.Errorf("expected 1 control message, got %d", len(controlMsgs)) - } - - fds, err := syscall.ParseUnixRights(&controlMsgs[0]) - if err != nil { - return fmt.Errorf("parse unix rights: %w", err) - } - if len(fds) != 1 { - return fmt.Errorf("expected 1 fd, got %d", len(fds)) - } - - uffdFd := fd(fds[0]) - defer uffdFd.close() - - mapping := NewMapping(regions) - - // Store for use by Prefetch(). - s.uffdFd = uffdFd - s.mapping = mapping - - slog.Info("uffd handler connected", - "regions", len(regions), - "fd", int(uffdFd), - ) - - // Signal readiness. - s.readyOnce.Do(func() { close(s.readyCh) }) - - // Run the poll loop. - return s.serve(ctx, uffdFd, mapping) -} - -// serve is the main poll loop. It polls the UFFD fd for page fault events -// and the exit pipe for shutdown signals. -func (s *Server) serve(ctx context.Context, uffdFd fd, mapping *Mapping) error { - pollFds := []unix.PollFd{ - {Fd: int32(uffdFd), Events: unix.POLLIN}, - {Fd: int32(s.exitR.Fd()), Events: unix.POLLIN}, - } - - var wg sync.WaitGroup - sem := make(chan struct{}, maxConcurrentFaults) - - // Always wait for in-flight goroutines before returning, so the caller - // can safely close the uffd fd after serve returns. - defer wg.Wait() - - for { - if _, err := unix.Poll(pollFds, -1); err != nil { - if err == unix.EINTR || err == unix.EAGAIN { - continue - } - return fmt.Errorf("poll: %w", err) - } - - // Check exit signal. - if pollFds[1].Revents&unix.POLLIN != 0 { - return nil - } - - if pollFds[0].Revents&unix.POLLIN == 0 { - continue - } - - // Read the uffd_msg. The fd is O_NONBLOCK (set by Firecracker), - // so EAGAIN is expected — just go back to poll. - buf := make([]byte, unsafe.Sizeof(uffdMsg{})) - n, err := readUffdMsg(uffdFd, buf) - if err == syscall.EAGAIN { - continue - } - if err != nil { - return fmt.Errorf("read uffd msg: %w", err) - } - if n == 0 { - continue - } - - msg := *(*uffdMsg)(unsafe.Pointer(&buf[0])) - event := getMsgEvent(&msg) - - switch event { - case UFFD_EVENT_PAGEFAULT: - // Handled below. - case UFFD_EVENT_REMOVE, UFFD_EVENT_UNMAP, UFFD_EVENT_REMAP, UFFD_EVENT_FORK: - // Non-fatal lifecycle events from the guest kernel (e.g. balloon - // deflation, mmap/munmap). No action needed — continue polling. - continue - default: - return fmt.Errorf("unexpected uffd event type: %d", event) - } - - arg := getMsgArg(&msg) - pf := *(*uffdPagefault)(unsafe.Pointer(&arg[0])) - addr := getPagefaultAddress(&pf) - - offset, pagesize, err := mapping.GetOffset(addr) - if err != nil { - return fmt.Errorf("resolve address %#x: %w", addr, err) - } - - sem <- struct{}{} - wg.Add(1) - go func() { - defer wg.Done() - defer func() { <-sem }() - - if err := s.faultPage(ctx, uffdFd, addr, offset, pagesize); err != nil { - slog.Error("uffd fault page error", - "addr", fmt.Sprintf("%#x", addr), - "offset", offset, - "error", err, - ) - } - }() - } -} - -// readUffdMsg reads a single uffd_msg, retrying on EINTR. 
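
The fd handoff in the deleted handle() is ordinary SCM_RIGHTS passing, distilled here to a self-contained sketch (stdlib calls only; buffer sizes are illustrative — assumes imports fmt, net, syscall):

    // recvFD performs one recvmsg that yields both a data payload and a
    // single file descriptor passed over the Unix socket.
    func recvFD(conn *net.UnixConn) (data []byte, fd int, err error) {
        buf := make([]byte, 1024)
        oob := make([]byte, syscall.CmsgSpace(4)) // room for one 32-bit fd
        n, oobn, _, _, err := conn.ReadMsgUnix(buf, oob)
        if err != nil {
            return nil, -1, err
        }
        cmsgs, err := syscall.ParseSocketControlMessage(oob[:oobn])
        if err != nil {
            return nil, -1, err
        }
        if len(cmsgs) != 1 {
            return nil, -1, fmt.Errorf("expected 1 control message, got %d", len(cmsgs))
        }
        fds, err := syscall.ParseUnixRights(&cmsgs[0])
        if err != nil {
            return nil, -1, err
        }
        if len(fds) != 1 {
            return nil, -1, fmt.Errorf("expected 1 fd, got %d", len(fds))
        }
        return buf[:n], fds[0], nil
    }
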
-// Returns (n, EAGAIN) if the non-blocking read has nothing available. -func readUffdMsg(uffdFd fd, buf []byte) (int, error) { - for { - n, err := syscall.Read(int(uffdFd), buf) - if err == syscall.EINTR { - continue - } - return n, err - } -} - -// faultPage fetches a page from the memory source and copies it into -// guest memory via UFFDIO_COPY. -func (s *Server) faultPage(ctx context.Context, uffdFd fd, addr uintptr, offset int64, pagesize uintptr) error { - data, err := s.source.ReadPage(ctx, offset, int64(pagesize)) - if err != nil { - return fmt.Errorf("read page at offset %d: %w", offset, err) - } - - // Mode 0: no write-protect. Standard Firecracker does not register - // UFFD ranges with WP support, so UFFDIO_COPY_MODE_WP would fail. - if err := uffdFd.copy(addr, pagesize, data, 0); err != nil { - if errors.Is(err, unix.EEXIST) { - // Page already mapped (race with prefetch or concurrent fault). - return nil - } - return fmt.Errorf("uffdio_copy: %w", err) - } - - return nil -} - -// Prefetch proactively loads all guest memory pages in the background. -// It iterates over every page in every UFFD region and copies it from the -// diff file into guest memory via UFFDIO_COPY. Pages already loaded by -// on-demand faults return nil from faultPage (EEXIST handled internally). -// This eliminates the per-request latency caused by lazy page faulting -// after snapshot restore. -// -// The goroutine blocks on readyCh before reading the uffd fd and mapping -// fields (establishes happens-before with handle()). It uses an internal -// context independent of the caller's RPC context so it survives after the -// create/resume RPC returns. Stop() cancels and joins the goroutine. -func (s *Server) Prefetch() { - ctx, cancel := context.WithCancel(context.Background()) - s.prefetchCancel = cancel - s.prefetchDone = make(chan struct{}) - - go func() { - defer close(s.prefetchDone) - - // Wait for Firecracker to connect and send the uffd fd. - select { - case <-s.readyCh: - case <-ctx.Done(): - return - } - - uffdFd := s.uffdFd - mapping := s.mapping - if mapping == nil { - return - } - - var total, errored int - for _, region := range mapping.Regions { - pageSize := region.PageSize - if pageSize == 0 { - continue - } - for off := uintptr(0); off < region.Size; off += pageSize { - if ctx.Err() != nil { - slog.Debug("uffd prefetch cancelled", - "pages", total, "errors", errored) - return - } - - addr := region.BaseHostVirtAddr + off - memOffset := int64(off) + int64(region.Offset) - - if err := s.faultPage(ctx, uffdFd, addr, memOffset, pageSize); err != nil { - errored++ - } else { - total++ - } - } - } - slog.Info("uffd prefetch complete", - "pages", total, "errors", errored) - }() -} - -// DiffFileSource serves pages from a snapshot's compact diff file using -// the header's block mapping to resolve offsets. -type DiffFileSource struct { - header *snapshot.Header - // diffs maps build ID → open file handle for each generation's diff file. - diffs map[string]*os.File -} - -// NewDiffFileSource creates a memory source backed by snapshot diff files. -// diffs maps build ID string to the file path of each generation's diff file. -func NewDiffFileSource(header *snapshot.Header, diffPaths map[string]string) (*DiffFileSource, error) { - diffs := make(map[string]*os.File, len(diffPaths)) - for id, path := range diffPaths { - f, err := os.Open(path) - if err != nil { - // Close already opened files. 
- for _, opened := range diffs { - opened.Close() - } - return nil, fmt.Errorf("open diff file %s: %w", path, err) - } - diffs[id] = f - } - return &DiffFileSource{header: header, diffs: diffs}, nil -} - -// ReadPage resolves a memory offset through the header mapping and reads -// the corresponding page from the correct generation's diff file. -func (s *DiffFileSource) ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error) { - mappedOffset, _, buildID, err := s.header.GetShiftedMapping(ctx, offset) - if err != nil { - return nil, fmt.Errorf("resolve offset %d: %w", offset, err) - } - - // uuid.Nil means zero-fill (empty page). - var nilUUID [16]byte - if *buildID == nilUUID { - return make([]byte, size), nil - } - - f, ok := s.diffs[buildID.String()] - if !ok { - return nil, fmt.Errorf("no diff file for build %s", buildID) - } - - buf := make([]byte, size) - n, err := f.ReadAt(buf, mappedOffset) - if err != nil && int64(n) < size { - return nil, fmt.Errorf("read diff at offset %d: %w", mappedOffset, err) - } - - return buf, nil -} - -// Close closes all open diff file handles. -func (s *DiffFileSource) Close() error { - var errs []error - for _, f := range s.diffs { - if err := f.Close(); err != nil { - errs = append(errs, err) - } - } - return errors.Join(errs...) -} diff --git a/internal/vm/ch.go b/internal/vm/ch.go new file mode 100644 index 0000000..0c365ba --- /dev/null +++ b/internal/vm/ch.go @@ -0,0 +1,213 @@ +package vm + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" +) + +// chClient talks to the Cloud Hypervisor HTTP API over a Unix socket. +type chClient struct { + http *http.Client + socketPath string +} + +func newCHClient(socketPath string) *chClient { + return &chClient{ + socketPath: socketPath, + http: &http.Client{ + Transport: &http.Transport{ + DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { + var d net.Dialer + return d.DialContext(ctx, "unix", socketPath) + }, + }, + }, + } +} + +func (c *chClient) do(ctx context.Context, method, path string, body any) error { + var bodyReader io.Reader + if body != nil { + data, err := json.Marshal(body) + if err != nil { + return fmt.Errorf("marshal request body: %w", err) + } + bodyReader = bytes.NewReader(data) + } + + req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader) + if err != nil { + return fmt.Errorf("create request: %w", err) + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + resp, err := c.http.Do(req) + if err != nil { + return fmt.Errorf("%s %s: %w", method, path, err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 300 { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody)) + } + + return nil +} + +// --- CH API payload types --- + +type chPayload struct { + Firmware string `json:"firmware,omitempty"` + Kernel string `json:"kernel"` + Cmdline string `json:"cmdline"` +} + +type chCPUs struct { + BootVCPUs int `json:"boot_vcpus"` + MaxVCPUs int `json:"max_vcpus"` +} + +type chMemory struct { + Size uint64 `json:"size"` + Shared bool `json:"shared,omitempty"` + HotplugSize uint64 `json:"hotplug_size,omitempty"` + HotplugMethod string `json:"hotplug_method,omitempty"` +} + +type chDisk struct { + Path string `json:"path"` + Readonly bool `json:"readonly,omitempty"` + ImageType string `json:"image_type,omitempty"` +} + +type chNet struct { + Tap string `json:"tap"` + MAC string `json:"mac"` 
+ NumQs int `json:"num_queues,omitempty"` + QueueS int `json:"queue_size,omitempty"` +} + +type chBalloon struct { + Size int64 `json:"size"` + DeflateOnOOM bool `json:"deflate_on_oom"` + FreePageRep bool `json:"free_page_reporting,omitempty"` +} + +type chConsole struct { + Mode string `json:"mode"` +} + +type chCreatePayload struct { + Payload chPayload `json:"payload"` + CPUs chCPUs `json:"cpus"` + Memory chMemory `json:"memory"` + Disks []chDisk `json:"disks"` + Net []chNet `json:"net"` + Balloon *chBalloon `json:"balloon,omitempty"` + Serial chConsole `json:"serial"` + Console chConsole `json:"console"` +} + +// createVM sends the full VM configuration as a single payload. +func (c *chClient) createVM(ctx context.Context, cfg *VMConfig) error { + memBytes := uint64(cfg.MemoryMB) * 1024 * 1024 + + payload := chCreatePayload{ + Payload: chPayload{ + Kernel: cfg.KernelPath, + Cmdline: cfg.kernelArgs(), + }, + CPUs: chCPUs{ + BootVCPUs: cfg.VCPUs, + MaxVCPUs: cfg.VCPUs, + }, + Memory: chMemory{ + Size: memBytes, + Shared: true, + }, + Disks: []chDisk{ + { + Path: cfg.SandboxDir + "/rootfs.ext4", + ImageType: "Raw", + }, + }, + Net: []chNet{ + { + Tap: cfg.TapDevice, + MAC: cfg.TapMAC, + }, + }, + Balloon: &chBalloon{ + Size: 0, + DeflateOnOOM: true, + FreePageRep: true, + }, + Serial: chConsole{ + Mode: "Tty", + }, + Console: chConsole{ + Mode: "Off", + }, + } + + return c.do(ctx, http.MethodPut, "/api/v1/vm.create", payload) +} + +// bootVM starts the VM after creation. +func (c *chClient) bootVM(ctx context.Context) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.boot", nil) +} + +// pauseVM pauses the microVM. +func (c *chClient) pauseVM(ctx context.Context) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.pause", nil) +} + +// resumeVM resumes a paused microVM. +func (c *chClient) resumeVM(ctx context.Context) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.resume", nil) +} + +// snapshotVM creates a VM snapshot to the given directory. +func (c *chClient) snapshotVM(ctx context.Context, destURL string) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.snapshot", map[string]string{ + "destination_url": destURL, + }) +} + +// restoreVM restores a VM from a snapshot via the API. Uses OnDemand memory +// restore mode for UFFD-based lazy page loading — only pages the guest +// actually touches are faulted in from disk. +func (c *chClient) restoreVM(ctx context.Context, sourceURL string) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.restore", map[string]any{ + "source_url": sourceURL, + "memory_restore_mode": "OnDemand", + "resume": true, + }) +} + +// shutdownVMM cleanly shuts down the Cloud Hypervisor VMM process. +func (c *chClient) shutdownVMM(ctx context.Context) error { + return c.do(ctx, http.MethodPut, "/api/v1/vmm.shutdown", nil) +} + +// resizeBalloon adjusts the balloon target at runtime. +// sizeBytes is memory to take FROM the guest (0 = give all back). +func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error { + return c.do(ctx, http.MethodPut, "/api/v1/vm.resize", map[string]int64{ + "desired_balloon": sizeBytes, + }) +} + +// ping checks if the VMM is alive and ready to accept commands. 
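
Put together, the cold-boot path through this client is just two calls. A sketch inside package vm — the VMM spawn, socket placement, and netns entry are process.go's job and are not shown:

    // coldBoot sketches the non-restore startup sequence.
    func coldBoot(ctx context.Context, cfg *VMConfig) error {
        client := newCHClient(cfg.SocketPath)
        if err := client.createVM(ctx, cfg); err != nil { // PUT /api/v1/vm.create
            return err
        }
        return client.bootVM(ctx) // PUT /api/v1/vm.boot
    }

    // Template builds later pause and snapshot in place:
    //   client.pauseVM(ctx)
    //   client.snapshotVM(ctx, "file://"+destDir)
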
+func (c *chClient) ping(ctx context.Context) error { + return c.do(ctx, http.MethodGet, "/api/v1/vmm.ping", nil) +} diff --git a/internal/vm/config.go b/internal/vm/config.go index ea229b6..f3c2341 100644 --- a/internal/vm/config.go +++ b/internal/vm/config.go @@ -2,13 +2,12 @@ package vm import "fmt" -// VMConfig holds the configuration for creating a Firecracker microVM. +// VMConfig holds the configuration for creating a Cloud Hypervisor microVM. type VMConfig struct { // SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4"). SandboxID string - // TemplateID is the template UUID string used to populate MMDS metadata - // so that envd can read WRENN_TEMPLATE_ID from inside the guest. + // TemplateID is the template UUID string, passed to envd via PostInit. TemplateID string // KernelPath is the path to the uncompressed Linux kernel (vmlinux). @@ -25,12 +24,12 @@ type VMConfig struct { MemoryMB int // NetworkNamespace is the name of the network namespace to launch - // Firecracker inside (e.g., "ns-1"). The namespace must already exist + // Cloud Hypervisor inside (e.g., "ns-1"). The namespace must already exist // with a TAP device configured. NetworkNamespace string // TapDevice is the name of the TAP device inside the network namespace - // that Firecracker will attach to (e.g., "tap0"). + // that Cloud Hypervisor will attach to (e.g., "tap0"). TapDevice string // TapMAC is the MAC address for the TAP device. @@ -45,19 +44,23 @@ type VMConfig struct { // NetMask is the subnet mask for the guest network (e.g., "255.255.255.252"). NetMask string - // FirecrackerBin is the path to the firecracker binary. - FirecrackerBin string + // VMMBin is the path to the cloud-hypervisor binary. + VMMBin string - // SocketPath is the path for the Firecracker API Unix socket. + // SocketPath is the path for the Cloud Hypervisor API Unix socket. SocketPath string // SandboxDir is the tmpfs mount point for per-sandbox files inside the - // mount namespace (e.g., "/fc-vm"). + // mount namespace (e.g., "/ch-vm"). SandboxDir string // InitPath is the path to the init process inside the guest. // Defaults to "/sbin/init" if empty. InitPath string + + // SnapshotDir is the path to the snapshot directory for restore. + // Only set when restoring from a snapshot. + SnapshotDir string } func (c *VMConfig) applyDefaults() { @@ -67,14 +70,14 @@ func (c *VMConfig) applyDefaults() { if c.MemoryMB == 0 { c.MemoryMB = 512 } - if c.FirecrackerBin == "" { - c.FirecrackerBin = "/usr/local/bin/firecracker" + if c.VMMBin == "" { + c.VMMBin = "/usr/local/bin/cloud-hypervisor" } if c.SocketPath == "" { - c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID) + c.SocketPath = fmt.Sprintf("/tmp/ch-%s.sock", c.SandboxID) } if c.SandboxDir == "" { - c.SandboxDir = "/tmp/fc-vm" + c.SandboxDir = "/tmp/ch-vm" } if c.TapDevice == "" { c.TapDevice = "tap0" @@ -95,7 +98,7 @@ func (c *VMConfig) kernelArgs() string { ) return fmt.Sprintf( - "console=ttyS0 reboot=k panic=1 pci=off quiet loglevel=1 clocksource=kvm-clock init=%s %s", + "console=ttyS0 root=/dev/vda rw reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=%s %s", c.InitPath, ipArg, ) } diff --git a/internal/vm/fc.go b/internal/vm/fc.go deleted file mode 100644 index 3f6204b..0000000 --- a/internal/vm/fc.go +++ /dev/null @@ -1,202 +0,0 @@ -package vm - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "net" - "net/http" -) - -// fcClient talks to the Firecracker HTTP API over a Unix socket. 
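
With the defaults above and a /30 slot of, say, guest 10.0.0.2 behind gateway 10.0.0.1, the new kernelArgs renders roughly as follows — the ip= clause is assembled a few lines above this hunk and its exact syntax isn't shown, so that part is an assumption based on the standard kernel ip= format:

    console=ttyS0 root=/dev/vda rw reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=/sbin/init ip=10.0.0.2::10.0.0.1:255.255.255.252::eth0:off

root=/dev/vda now has to appear explicitly — presumably because CH has no counterpart to Firecracker's is_root_device drive flag — and pci=off is gone since CH exposes its devices over PCI.
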
-type fcClient struct { - http *http.Client - socketPath string -} - -func newFCClient(socketPath string) *fcClient { - return &fcClient{ - socketPath: socketPath, - http: &http.Client{ - Transport: &http.Transport{ - DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) { - var d net.Dialer - return d.DialContext(ctx, "unix", socketPath) - }, - }, - // No global timeout — callers pass context.Context with appropriate - // deadlines. A fixed 10s timeout was too short for snapshot/resume - // operations on large-memory VMs (20GB+ memfiles). - }, - } -} - -func (c *fcClient) do(ctx context.Context, method, path string, body any) error { - var bodyReader io.Reader - if body != nil { - data, err := json.Marshal(body) - if err != nil { - return fmt.Errorf("marshal request body: %w", err) - } - bodyReader = bytes.NewReader(data) - } - - // The host in the URL is ignored for Unix sockets; we use "localhost" by convention. - req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader) - if err != nil { - return fmt.Errorf("create request: %w", err) - } - if body != nil { - req.Header.Set("Content-Type", "application/json") - } - - resp, err := c.http.Do(req) - if err != nil { - return fmt.Errorf("%s %s: %w", method, path, err) - } - defer resp.Body.Close() - - if resp.StatusCode >= 300 { - respBody, _ := io.ReadAll(resp.Body) - return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody)) - } - - return nil -} - -// setBootSource configures the kernel and boot args. -func (c *fcClient) setBootSource(ctx context.Context, kernelPath, bootArgs string) error { - return c.do(ctx, http.MethodPut, "/boot-source", map[string]string{ - "kernel_image_path": kernelPath, - "boot_args": bootArgs, - }) -} - -// setRootfsDrive configures the root filesystem drive. -func (c *fcClient) setRootfsDrive(ctx context.Context, driveID, path string, readOnly bool) error { - return c.do(ctx, http.MethodPut, "/drives/"+driveID, map[string]any{ - "drive_id": driveID, - "path_on_host": path, - "is_root_device": true, - "is_read_only": readOnly, - }) -} - -// setNetworkInterface configures a network interface attached to a TAP device. -// A tx_rate_limiter caps sustained guest→host throughput to prevent user -// application traffic from completely saturating the TAP device and starving -// envd control traffic (PTY, exec, file ops). -func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, macAddr string) error { - return c.do(ctx, http.MethodPut, "/network-interfaces/"+ifaceID, map[string]any{ - "iface_id": ifaceID, - "host_dev_name": tapName, - "guest_mac": macAddr, - "tx_rate_limiter": map[string]any{ - "bandwidth": map[string]any{ - "size": 209715200, // 200 MB/s sustained - "refill_time": 1000, // refill period: 1 second - "one_time_burst": 104857600, // 100 MB initial burst - }, - }, - }) -} - -// setMachineConfig configures vCPUs, memory, and other machine settings. -func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error { - return c.do(ctx, http.MethodPut, "/machine-config", map[string]any{ - "vcpu_count": vcpus, - "mem_size_mib": memMB, - "smt": false, - }) -} - -// setMMDSConfig enables MMDS V2 token-based access on the given network interface. -// Must be called before startVM. 
-func (c *fcClient) setMMDSConfig(ctx context.Context, ifaceID string) error { - return c.do(ctx, http.MethodPut, "/mmds/config", map[string]any{ - "version": "V2", - "network_interfaces": []string{ifaceID}, - }) -} - -// mmdsMetadata is the metadata payload written to the Firecracker MMDS store. -// envd reads this via PollForMMDSOpts to populate WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID. -type mmdsMetadata struct { - SandboxID string `json:"instanceID"` - TemplateID string `json:"envID"` -} - -// setMMDS writes sandbox metadata to the Firecracker MMDS store. -// Can be called after the VM has started. -func (c *fcClient) setMMDS(ctx context.Context, sandboxID, templateID string) error { - return c.do(ctx, http.MethodPut, "/mmds", mmdsMetadata{ - SandboxID: sandboxID, - TemplateID: templateID, - }) -} - -// setBalloon configures the Firecracker balloon device for dynamic memory -// management. deflateOnOom lets the guest reclaim balloon pages under memory -// pressure. statsInterval enables periodic stats via GET /balloon/statistics. -// Must be called before startVM. -func (c *fcClient) setBalloon(ctx context.Context, amountMiB int, deflateOnOom bool, statsIntervalS int) error { - return c.do(ctx, http.MethodPut, "/balloon", map[string]any{ - "amount_mib": amountMiB, - "deflate_on_oom": deflateOnOom, - "stats_polling_interval_s": statsIntervalS, - }) -} - -// updateBalloon adjusts the balloon target at runtime. -func (c *fcClient) updateBalloon(ctx context.Context, amountMiB int) error { - return c.do(ctx, http.MethodPatch, "/balloon", map[string]any{ - "amount_mib": amountMiB, - }) -} - -// startVM issues the InstanceStart action. -func (c *fcClient) startVM(ctx context.Context) error { - return c.do(ctx, http.MethodPut, "/actions", map[string]string{ - "action_type": "InstanceStart", - }) -} - -// pauseVM pauses the microVM. -func (c *fcClient) pauseVM(ctx context.Context) error { - return c.do(ctx, http.MethodPatch, "/vm", map[string]string{ - "state": "Paused", - }) -} - -// resumeVM resumes a paused microVM. -func (c *fcClient) resumeVM(ctx context.Context) error { - return c.do(ctx, http.MethodPatch, "/vm", map[string]string{ - "state": "Resumed", - }) -} - -// createSnapshot creates a VM snapshot. -// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume). -func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error { - return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{ - "snapshot_type": snapshotType, - "snapshot_path": snapPath, - "mem_file_path": memPath, - }) -} - -// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for -// lazy memory loading. Firecracker will connect to the socket and -// send the uffd fd + memory region mappings. -func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error { - return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{ - "snapshot_path": snapPath, - "resume_vm": false, - "mem_backend": map[string]any{ - "backend_type": "Uffd", - "backend_path": uffdSocketPath, - }, - }) -} diff --git a/internal/vm/manager.go b/internal/vm/manager.go index 3d55620..5f8539e 100644 --- a/internal/vm/manager.go +++ b/internal/vm/manager.go @@ -9,14 +9,14 @@ import ( "time" ) -// VM represents a running Firecracker microVM. +// VM represents a running Cloud Hypervisor microVM. 
 type VM struct {
 	Config VMConfig
 
 	process *process
-	client  *fcClient
+	client  *chClient
 }
 
-// Manager handles the lifecycle of Firecracker microVMs.
+// Manager handles the lifecycle of Cloud Hypervisor microVMs.
 type Manager struct {
 	mu sync.RWMutex
 	// vms tracks running VMs by sandbox ID.
@@ -30,7 +30,7 @@
 	}
 }
 
-// Create boots a new Firecracker microVM with the given configuration.
+// Create boots a new Cloud Hypervisor microVM with the given configuration.
 // The network namespace and TAP device must already be set up.
 func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
 	cfg.applyDefaults()
@@ -38,7 +38,6 @@
 		return nil, fmt.Errorf("invalid config: %w", err)
 	}
 
-	// Clean up any leftover socket from a previous run.
 	os.Remove(cfg.SocketPath)
 
 	slog.Info("creating VM",
@@ -47,7 +46,7 @@
 		"memory_mb", cfg.MemoryMB,
 	)
 
-	// Step 1: Launch the Firecracker process.
+	// Step 1: Launch the Cloud Hypervisor process.
 	proc, err := startProcess(ctx, &cfg)
 	if err != nil {
 		return nil, fmt.Errorf("start process: %w", err)
 	}
@@ -59,25 +58,18 @@
 		return nil, fmt.Errorf("wait for socket: %w", err)
 	}
 
-	// Step 3: Configure the VM via the Firecracker API.
-	client := newFCClient(cfg.SocketPath)
+	// Step 3: Configure the VM with a single vm.create call.
+	client := newCHClient(cfg.SocketPath)
 
-	if err := configureVM(ctx, client, &cfg); err != nil {
+	if err := client.createVM(ctx, &cfg); err != nil {
 		_ = proc.stop()
-		return nil, fmt.Errorf("configure VM: %w", err)
+		return nil, fmt.Errorf("create VM config: %w", err)
 	}
 
-	// Step 4: Start the VM.
-	if err := client.startVM(ctx); err != nil {
+	// Step 4: Boot the VM.
+	if err := client.bootVM(ctx); err != nil {
 		_ = proc.stop()
-		return nil, fmt.Errorf("start VM: %w", err)
-	}
-
-	// Step 5: Push sandbox metadata into MMDS so envd can read
-	// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
-	if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
-		_ = proc.stop()
-		return nil, fmt.Errorf("set MMDS metadata: %w", err)
+		return nil, fmt.Errorf("boot VM: %w", err)
 	}
 
 	vm := &VM{
@@ -95,46 +87,6 @@
 	return vm, nil
 }
 
-// configureVM sends the configuration to Firecracker via its HTTP API.
-func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
-	// Boot source (kernel + args)
-	if err := client.setBootSource(ctx, cfg.KernelPath, cfg.kernelArgs()); err != nil {
-		return fmt.Errorf("set boot source: %w", err)
-	}
-
-	// Root drive — use the symlink path inside the mount namespace so that
-	// snapshots record a stable path that works on restore.
-	rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
-	if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
-		return fmt.Errorf("set rootfs drive: %w", err)
-	}
-
-	// Network interface
-	if err := client.setNetworkInterface(ctx, "eth0", cfg.TapDevice, cfg.TapMAC); err != nil {
-		return fmt.Errorf("set network interface: %w", err)
-	}
-
-	// Machine config (vCPUs + memory)
-	if err := client.setMachineConfig(ctx, cfg.VCPUs, cfg.MemoryMB); err != nil {
-		return fmt.Errorf("set machine config: %w", err)
-	}
-
-	// Balloon device — allows the host to reclaim unused guest memory.
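The boot path is now two API calls instead of Firecracker's five-plus. A hedged usage sketch of the simplified flow, with illustrative paths and addresses (none of these values appear in the patch):

```go
mgr := vm.NewManager()
v, err := mgr.Create(ctx, vm.VMConfig{
	SandboxID:        "cl-a1b2c3d4",
	KernelPath:       "/var/lib/wrenn/kernels/vmlinux",
	RootfsPath:       "/var/lib/wrenn/sandboxes/cl-a1b2c3d4/rootfs.ext4",
	NetworkNamespace: "ns-1",
	TapDevice:        "tap0",
	TapMAC:           "06:00:ac:10:00:02",
	VCPUs:            2,
	MemoryMB:         1024,
})
if err != nil {
	return fmt.Errorf("boot sandbox: %w", err)
}
// Destroy attempts a clean vmm.shutdown before killing the process group.
defer mgr.Destroy(context.Background(), v.Config.SandboxID)
```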
-	// Start with 0 (no inflation). deflate_on_oom lets the guest reclaim
-	// balloon pages under memory pressure. Stats interval enables monitoring.
-	if err := client.setBalloon(ctx, 0, true, 5); err != nil {
-		slog.Warn("set balloon failed (non-fatal, VM will run without memory reclaim)", "error", err)
-	}
-
-	// MMDS config — enable V2 token access on eth0 so that envd can read
-	// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
-	if err := client.setMMDSConfig(ctx, "eth0"); err != nil {
-		return fmt.Errorf("set MMDS config: %w", err)
-	}
-
-	return nil
-}
-
 // Pause pauses a running VM.
 func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 	m.mu.RLock()
@@ -179,7 +131,8 @@ func (m *Manager) UpdateBalloon(ctx context.Context, sandboxID string, amountMiB
 		return fmt.Errorf("VM not found: %s", sandboxID)
 	}
 
-	return vm.client.updateBalloon(ctx, amountMiB)
+	sizeBytes := int64(amountMiB) * 1024 * 1024
+	return vm.client.resizeBalloon(ctx, sizeBytes)
 }
 
 // Destroy stops and cleans up a VM.
@@ -195,12 +148,17 @@
 
 	slog.Info("destroying VM", "sandbox", sandboxID)
 
-	// Stop the Firecracker process.
+	// Try clean shutdown first, fall back to process kill.
+	shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second)
+	if err := vm.client.shutdownVMM(shutdownCtx); err != nil {
+		slog.Debug("clean VMM shutdown failed, killing process", "sandbox", sandboxID, "error", err)
+	}
+	shutdownCancel()
+
 	if err := vm.process.stop(); err != nil {
 		slog.Warn("error stopping process", "sandbox", sandboxID, "error", err)
 	}
 
-	// Clean up the API socket.
 	os.Remove(vm.Config.SocketPath)
 
 	slog.Info("VM destroyed", "sandbox", sandboxID)
@@ -208,8 +166,8 @@
 }
 
 // Snapshot creates a VM snapshot. The VM must already be paused.
-// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
-func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
+// snapshotDir is a local directory; it is passed to CH as a file:// destination URL.
+func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapshotDir string) error {
 	m.mu.RLock()
 	vm, ok := m.vms[sandboxID]
 	m.mu.RUnlock()
@@ -217,29 +175,35 @@
 		return fmt.Errorf("VM not found: %s", sandboxID)
 	}
 
-	if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
+	destURL := "file://" + snapshotDir
+	if err := vm.client.snapshotVM(ctx, destURL); err != nil {
 		return fmt.Errorf("create snapshot: %w", err)
 	}
 
-	slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
+	slog.Info("VM snapshot created", "sandbox", sandboxID, "snapshot_dir", snapshotDir)
 	return nil
 }
 
-// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot
-// using UFFD for lazy memory loading. The network namespace and TAP
-// device must already be set up.
+// CreateFromSnapshot boots a new Cloud Hypervisor VM by restoring from a
+// snapshot directory. The network namespace and TAP device must already be set up.
 //
-// No boot resources (kernel, drives, machine config) are configured —
-// the snapshot carries all that state. The rootfs path recorded in the
-// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4
-// inside the mount namespace (created by the start script in jailer.go).
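Note the unit change hiding in UpdateBalloon: Firecracker's PATCH /balloon took MiB, while CH's vm.resize wants a byte count. Keeping the MiB-based signature means callers don't change, and the conversion is a single multiplication, e.g. reclaiming 512 MiB from the guest:

```go
amountMiB := 512
sizeBytes := int64(amountMiB) * 1024 * 1024 // 512 MiB -> 536870912 bytes
// sizeBytes is memory taken FROM the guest; passing 0 gives it all back.
```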
+// A bare CH process is started first, then the restore is performed via the API +// with memory_restore_mode=OnDemand for UFFD-based lazy page loading. This means +// only pages the guest actually touches are faulted in from disk — a 16GB template +// with 2GB active working set only loads ~2GB into RAM at restore time. +// +// The restore API also sets resume=true, so the VM starts running immediately +// without a separate resume call. +// +// The rootfs path recorded in the snapshot is resolved via a stable symlink at +// SandboxDir/rootfs.ext4 inside the mount namespace. // // The sequence is: -// 1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink) +// 1. Start bare CH process in mount+network namespace // 2. Wait for API socket -// 3. Load snapshot with UFFD backend -// 4. Resume VM execution -func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) { +// 3. Restore VM via API (OnDemand memory + auto-resume) +func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshotDir string) (*VM, error) { + cfg.SnapshotDir = snapshotDir cfg.applyDefaults() if err := cfg.validate(); err != nil { return nil, fmt.Errorf("invalid config: %w", err) @@ -249,14 +213,11 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath slog.Info("restoring VM from snapshot", "sandbox", cfg.SandboxID, - "snap_path", snapPath, + "snapshot_dir", snapshotDir, ) - // Step 1: Launch the Firecracker process. - // The start script creates a tmpfs at SandboxDir and symlinks - // rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs - // path (/fc-vm/rootfs.ext4) resolves to the new clone. - proc, err := startProcess(ctx, &cfg) + // Step 1: Launch bare CH process (no --restore). + proc, err := startProcessForRestore(ctx, &cfg) if err != nil { return nil, fmt.Errorf("start process: %w", err) } @@ -267,26 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath return nil, fmt.Errorf("wait for socket: %w", err) } - client := newFCClient(cfg.SocketPath) + client := newCHClient(cfg.SocketPath) - // Step 3: Load the snapshot with UFFD backend. - // No boot resources are configured — the snapshot carries kernel, - // drive, network, and machine config state. - if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil { + // Step 3: Restore via API with OnDemand memory + auto-resume. + sourceURL := "file://" + snapshotDir + if err := client.restoreVM(ctx, sourceURL); err != nil { _ = proc.stop() - return nil, fmt.Errorf("load snapshot: %w", err) - } - - // Step 4: Resume the VM. - if err := client.resumeVM(ctx); err != nil { - _ = proc.stop() - return nil, fmt.Errorf("resume VM: %w", err) - } - - // Step 5: Push sandbox metadata into MMDS. - if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil { - _ = proc.stop() - return nil, fmt.Errorf("set MMDS metadata: %w", err) + return nil, fmt.Errorf("restore VM: %w", err) } vm := &VM{ @@ -304,11 +252,15 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath } // PID returns the process ID of the unshare wrapper process. -// The actual Firecracker process is a direct child of this PID. func (v *VM) PID() int { return v.process.cmd.Process.Pid } +// Exited returns a channel that is closed when the VM process exits. +func (v *VM) Exited() <-chan struct{} { + return v.process.exited() +} + // Get returns a running VM by sandbox ID. 
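A hedged sketch of the restore path from the caller's side, including how the new Exited channel can feed a crash watcher. The snapshot path and the goroutine shape are illustrative, not part of this patch:

```go
v, err := mgr.CreateFromSnapshot(ctx, cfg, "/var/lib/wrenn/snapshots/tmpl-42")
if err != nil {
	return fmt.Errorf("restore sandbox: %w", err)
}

// Crash watcher: the channel closes when the unshare/CH process group
// exits, whether via Destroy or an unexpected VMM crash.
go func() {
	<-v.Exited()
	slog.Info("VM process exited", "sandbox", v.Config.SandboxID)
}()
```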
func (m *Manager) Get(sandboxID string) (*VM, bool) { m.mu.RLock() @@ -317,7 +269,7 @@ func (m *Manager) Get(sandboxID string) (*VM, bool) { return vm, ok } -// waitForSocket polls for the Firecracker API socket to appear on disk. +// waitForSocket polls for the Cloud Hypervisor API socket to appear on disk. func waitForSocket(ctx context.Context, socketPath string, proc *process) error { ticker := time.NewTicker(10 * time.Millisecond) defer ticker.Stop() @@ -329,7 +281,7 @@ func waitForSocket(ctx context.Context, socketPath string, proc *process) error case <-ctx.Done(): return ctx.Err() case <-proc.exited(): - return fmt.Errorf("firecracker process exited before socket was ready") + return fmt.Errorf("cloud-hypervisor process exited before socket was ready") case <-timeout: return fmt.Errorf("timed out waiting for API socket at %s", socketPath) case <-ticker.C: diff --git a/internal/vm/jailer.go b/internal/vm/process.go similarity index 53% rename from internal/vm/jailer.go rename to internal/vm/process.go index e528540..da0d6e2 100644 --- a/internal/vm/jailer.go +++ b/internal/vm/process.go @@ -10,7 +10,7 @@ import ( "time" ) -// process represents a running Firecracker process with mount and network +// process represents a running Cloud Hypervisor process with mount and network // namespace isolation. type process struct { cmd *exec.Cmd @@ -20,33 +20,42 @@ type process struct { exitErr error } -// startProcess launches the Firecracker binary inside an isolated mount namespace -// and the specified network namespace. The launch sequence: +// startProcess launches the Cloud Hypervisor binary inside an isolated mount +// namespace and the specified network namespace. Used for fresh boot (no +// snapshot). The launch sequence: // // 1. unshare -m: creates a private mount namespace // 2. mount --make-rprivate /: prevents mount propagation to host // 3. mount tmpfs at SandboxDir: ephemeral workspace for this VM // 4. symlink kernel and rootfs into SandboxDir // 5. ip netns exec : enters the network namespace where TAP is configured -// 6. exec firecracker with the API socket path +// 6. exec cloud-hypervisor with the API socket path func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) { - // Use a background context for the long-lived Firecracker process. - // The request context (ctx) is only used for the startup phase — we must - // not tie the VM's lifetime to the HTTP request that created it. - execCtx, cancel := context.WithCancel(context.Background()) - script := buildStartScript(cfg) + return launchScript(script, cfg) +} + +// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore). +// The restore is performed via the API after the socket is ready, which allows +// passing memory_restore_mode=OnDemand for UFFD lazy paging. 
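waitForSocket only proves the socket file exists; the VMM may still be a beat away from serving requests. If that race ever bites, the vmm.ping endpoint added above offers a natural follow-up probe. A sketch, not part of this patch:

```go
// waitForAPI polls vmm.ping until the VMM answers or ctx expires.
func waitForAPI(ctx context.Context, client *chClient) error {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			if err := client.ping(ctx); err == nil {
				return nil
			}
		}
	}
}
```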
+func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) { + script := buildRestoreScript(cfg) + return launchScript(script, cfg) +} + +func launchScript(script string, cfg *VMConfig) (*process, error) { + execCtx, cancel := context.WithCancel(context.Background()) cmd := exec.CommandContext(execCtx, "unshare", "-m", "--", "bash", "-c", script) cmd.SysProcAttr = &syscall.SysProcAttr{ - Setsid: true, // new session so signals don't propagate from parent + Setsid: true, } cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Start(); err != nil { cancel() - return nil, fmt.Errorf("start firecracker process: %w", err) + return nil, fmt.Errorf("start cloud-hypervisor process: %w", err) } p := &process{ @@ -60,7 +69,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) { close(p.exitCh) }() - slog.Info("firecracker process started", + slog.Info("cloud-hypervisor process started", "pid", cmd.Process.Pid, "sandbox", cfg.SandboxID, ) @@ -68,35 +77,56 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) { return p, nil } -// buildStartScript generates the bash script that sets up the mount namespace, -// symlinks kernel/rootfs, and execs Firecracker inside the network namespace. +// buildStartScript generates the bash script for fresh boot: sets up mount +// namespace, symlinks kernel/rootfs, and execs Cloud Hypervisor. func buildStartScript(cfg *VMConfig) string { return fmt.Sprintf(` set -euo pipefail -# Prevent mount propagation to the host mount --make-rprivate / -# Create ephemeral tmpfs workspace mkdir -p %[1]s mount -t tmpfs tmpfs %[1]s -# Symlink kernel and rootfs into the workspace ln -s %[2]s %[1]s/vmlinux ln -s %[3]s %[1]s/rootfs.ext4 -# Launch Firecracker inside the network namespace -exec ip netns exec %[4]s %[5]s --api-sock %[6]s +exec ip netns exec %[4]s %[5]s --api-socket path=%[6]s `, cfg.SandboxDir, // 1 cfg.KernelPath, // 2 cfg.RootfsPath, // 3 cfg.NetworkNamespace, // 4 - cfg.FirecrackerBin, // 5 + cfg.VMMBin, // 5 cfg.SocketPath, // 6 ) } +// buildRestoreScript generates the bash script for snapshot restore: sets up +// mount namespace, symlinks rootfs, and starts a bare Cloud Hypervisor process. +// The actual restore is done via the API (PUT /vm.restore) after the socket is +// ready, which enables memory_restore_mode=OnDemand for UFFD lazy paging. +func buildRestoreScript(cfg *VMConfig) string { + return fmt.Sprintf(` +set -euo pipefail + +mount --make-rprivate / + +mkdir -p %[1]s +mount -t tmpfs tmpfs %[1]s + +ln -s %[2]s %[1]s/rootfs.ext4 + +exec ip netns exec %[3]s %[4]s --api-socket path=%[5]s +`, + cfg.SandboxDir, // 1 + cfg.RootfsPath, // 2 + cfg.NetworkNamespace, // 3 + cfg.VMMBin, // 4 + cfg.SocketPath, // 5 + ) +} + // stop sends SIGTERM and waits for the process to exit. If it doesn't exit // within 10 seconds, SIGKILL is sent. func (p *process) stop() error { @@ -104,7 +134,6 @@ func (p *process) stop() error { return nil } - // Send SIGTERM to the process group (negative PID). 
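To sanity-check the templating, printing the rendered restore script for a sample config gives roughly the following (all values hypothetical; defaults come from applyDefaults):

```go
cfg := &VMConfig{
	SandboxID:        "cl-a1b2c3d4",
	RootfsPath:       "/var/lib/wrenn/sandboxes/cl-a1b2c3d4/rootfs.ext4",
	NetworkNamespace: "ns-1",
}
cfg.applyDefaults()
fmt.Println(buildRestoreScript(cfg))
// set -euo pipefail
//
// mount --make-rprivate /
//
// mkdir -p /tmp/ch-vm
// mount -t tmpfs tmpfs /tmp/ch-vm
//
// ln -s /var/lib/wrenn/sandboxes/cl-a1b2c3d4/rootfs.ext4 /tmp/ch-vm/rootfs.ext4
//
// exec ip netns exec ns-1 /usr/local/bin/cloud-hypervisor --api-socket path=/tmp/ch-cl-a1b2c3d4.sock
```

Note there is no kernel symlink here, unlike buildStartScript: the snapshot carries the kernel and machine state, so only the rootfs path needs to resolve inside the mount namespace.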
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGTERM); err != nil { slog.Debug("sigterm failed, process may have exited", "error", err) } @@ -113,7 +142,7 @@ func (p *process) stop() error { case <-p.exitCh: return nil case <-time.After(10 * time.Second): - slog.Warn("firecracker did not exit after SIGTERM, sending SIGKILL") + slog.Warn("cloud-hypervisor did not exit after SIGTERM, sending SIGKILL") if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGKILL); err != nil { slog.Debug("sigkill failed", "error", err) } diff --git a/pkg/cpserver/run.go b/pkg/cpserver/run.go index 7ef47d1..1fbf6eb 100644 --- a/pkg/cpserver/run.go +++ b/pkg/cpserver/run.go @@ -177,10 +177,11 @@ func Run(opts ...Option) { Config: cfg, } - // Host monitor (passive + active reconciliation every 60s). - // Created before API server so the heartbeat handler can trigger immediate - // reconciliation when a host recovers from unreachable. - monitor := api.NewHostMonitor(queries, hostPool, al, 60*time.Second) + // Host monitor (safety-net reconciliation every 5 minutes). + // Primary state sync is push-based (host agent callbacks + CP background + // goroutines). The monitor acts as a fallback for missed events, host death + // detection, and transient status resolution. + monitor := api.NewHostMonitor(queries, hostPool, al, 5*time.Minute) // API server. srv := api.New(queries, hostPool, hostScheduler, pool, rdb, []byte(cfg.JWTSecret), oauthRegistry, cfg.OAuthRedirectURL, ca, al, channelSvc, mailer, o.extensions, sctx, monitor, o.version) diff --git a/pkg/service/build.go b/pkg/service/build.go index bdb1620..dee639c 100644 --- a/pkg/service/build.go +++ b/pkg/service/build.go @@ -326,7 +326,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) { s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err)) return } - // Capture sandbox metadata (envd/kernel/firecracker/agent versions). + // Capture sandbox metadata (envd/kernel/vmm/agent versions). sandboxMetadata := resp.Msg.Metadata // Record sandbox/host association. @@ -768,7 +768,7 @@ var runtimeEnvVars = map[string]bool{ "HOME": true, "USER": true, "LOGNAME": true, "SHELL": true, "PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true, "SHLVL": true, "_": true, - // Per-sandbox identifiers set by envd at boot via MMDS. + // Per-sandbox identifiers set by envd at boot via PostInit. "WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true, }