forked from wrenn/wrenn
feat(vm): replace Firecracker with Cloud Hypervisor
Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required. Key changes: - Remove fc.go, jailer.go (FC process management) - Remove internal/uffd/ package (userfaultfd lazy page loading) - Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format) - Add ch.go (CH HTTP API client over Unix socket) - Add process.go (CH process lifecycle with unshare+netns) - Add chversion.go (CH version detection) - Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher - Simplify snapshot/local.go to CH's native snapshot format - Update VM config: FirecrackerBin → VMMBin, new CH-specific fields - Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
@ -126,22 +126,22 @@ func main() {
|
||||
}
|
||||
slog.Info("resolved kernel", "version", kernelVersion, "path", kernelPath)
|
||||
|
||||
// Detect firecracker version.
|
||||
fcBin := envOrDefault("WRENN_FIRECRACKER_BIN", "/usr/local/bin/firecracker")
|
||||
fcVersion, err := sandbox.DetectFirecrackerVersion(fcBin)
|
||||
// Detect cloud-hypervisor version.
|
||||
chBin := envOrDefault("WRENN_CH_BIN", "/usr/local/bin/cloud-hypervisor")
|
||||
chVersion, err := sandbox.DetectCHVersion(chBin)
|
||||
if err != nil {
|
||||
slog.Error("failed to detect firecracker version", "error", err)
|
||||
slog.Error("failed to detect cloud-hypervisor version", "error", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
slog.Info("resolved firecracker", "version", fcVersion, "path", fcBin)
|
||||
slog.Info("resolved cloud-hypervisor", "version", chVersion, "path", chBin)
|
||||
|
||||
cfg := sandbox.Config{
|
||||
WrennDir: rootDir,
|
||||
DefaultRootfsSizeMB: defaultRootfsSizeMB,
|
||||
KernelPath: kernelPath,
|
||||
KernelVersion: kernelVersion,
|
||||
FirecrackerBin: fcBin,
|
||||
FirecrackerVersion: fcVersion,
|
||||
VMMBin: chBin,
|
||||
VMMVersion: chVersion,
|
||||
AgentVersion: version,
|
||||
}
|
||||
|
||||
@ -245,12 +245,16 @@ func main() {
|
||||
},
|
||||
)
|
||||
|
||||
// Graceful shutdown on SIGINT/SIGTERM.
|
||||
// Graceful shutdown on SIGINT/SIGTERM. A second signal force-exits
|
||||
// so the operator can always kill the process if shutdown hangs.
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
sig := <-sigCh
|
||||
doShutdown("signal: " + sig.String())
|
||||
go doShutdown("signal: " + sig.String())
|
||||
sig = <-sigCh
|
||||
slog.Error("received second signal, force exiting", "signal", sig.String())
|
||||
os.Exit(1)
|
||||
}()
|
||||
|
||||
slog.Info("host agent starting", "addr", listenAddr, "host_id", creds.HostID, "version", version, "commit", commit)
|
||||
@ -292,7 +296,7 @@ func checkPrivileges() error {
|
||||
name string
|
||||
}{
|
||||
{1, "CAP_DAC_OVERRIDE"}, // /dev/loop*, /dev/mapper/*, /dev/net/tun
|
||||
{5, "CAP_KILL"}, // SIGTERM/SIGKILL to Firecracker processes
|
||||
{5, "CAP_KILL"}, // SIGTERM/SIGKILL to cloud-hypervisor processes
|
||||
{12, "CAP_NET_ADMIN"}, // netlink, iptables, routing, TAP/veth
|
||||
{13, "CAP_NET_RAW"}, // raw sockets (iptables)
|
||||
{19, "CAP_SYS_PTRACE"}, // reading /proc/self/ns/net (netns.Get)
|
||||
|
||||
@ -19,6 +19,12 @@ import (
|
||||
// it is considered unreachable (3 missed 30-second heartbeats).
|
||||
const unreachableThreshold = 90 * time.Second
|
||||
|
||||
// transientGracePeriod is how long a sandbox is allowed to stay in a transient
|
||||
// status (starting, resuming, pausing, stopping) before the monitor infers a
|
||||
// final state. This prevents the monitor from racing against in-flight RPCs
|
||||
// that may not have registered the sandbox on the host agent yet.
|
||||
const transientGracePeriod = 2 * time.Minute
|
||||
|
||||
// HostMonitor runs on a fixed interval and performs two duties:
|
||||
//
|
||||
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
|
||||
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Sandbox is not alive on host — infer final state.
|
||||
// Sandbox is not alive on host. If the transition is recent, give the
|
||||
// in-flight RPC time to finish before declaring a final state.
|
||||
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
|
||||
slog.Debug("host monitor: transient sandbox still within grace period",
|
||||
"sandbox_id", sbIDStr, "status", sb.Status,
|
||||
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
|
||||
continue
|
||||
}
|
||||
|
||||
// Grace period expired — infer final state.
|
||||
var finalStatus string
|
||||
switch sb.Status {
|
||||
case "starting", "resuming":
|
||||
|
||||
@ -42,6 +42,7 @@ const (
|
||||
SandboxEventResumed = "sandbox.resumed"
|
||||
SandboxEventStopped = "sandbox.stopped"
|
||||
SandboxEventFailed = "sandbox.failed"
|
||||
SandboxEventError = "sandbox.error"
|
||||
SandboxEventAutoPaused = "sandbox.auto_paused"
|
||||
)
|
||||
|
||||
@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
|
||||
c.handlePaused(ctx, sandboxID, event)
|
||||
case SandboxEventStopped:
|
||||
c.handleStopped(ctx, sandboxID, event)
|
||||
case SandboxEventFailed:
|
||||
case SandboxEventFailed, SandboxEventError:
|
||||
c.handleFailed(ctx, sandboxID)
|
||||
case SandboxEventAutoPaused:
|
||||
c.handleAutoPaused(ctx, sandboxID, event)
|
||||
@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp
|
||||
}
|
||||
|
||||
func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) {
|
||||
// Try stopping → stopped (CP-initiated destroy completed).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "stopping",
|
||||
Status_2: "stopped",
|
||||
}); err == nil {
|
||||
return
|
||||
}
|
||||
// Try running → stopped (autonomous destroy, e.g. TTL auto-destroy).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "running",
|
||||
Status_2: "stopped",
|
||||
}); err != nil && !errors.Is(err, pgx.ErrNoRows) {
|
||||
slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// handleFailed is a no-op fallback — the background goroutine already
|
||||
// performed the conditional DB update before publishing this event.
|
||||
// We keep the case arm so unknown event types are flagged, but avoid
|
||||
// an unconditional status write that could clobber concurrent operations.
|
||||
func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {}
|
||||
// handleFailed marks a sandbox as "error" when the host agent reports a crash
|
||||
// or the CP's background goroutine publishes a failure. Uses conditional update
|
||||
// to avoid clobbering concurrent operations.
|
||||
func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
|
||||
// Try running → error (VM crash pushed by host agent).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "running", Status_2: "error",
|
||||
}); err == nil {
|
||||
return
|
||||
}
|
||||
// Try starting → error (create failed).
|
||||
_, _ = c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "starting", Status_2: "error",
|
||||
})
|
||||
}
|
||||
|
||||
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {
|
||||
sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
|
||||
@ -80,8 +80,8 @@ func (r *LoopRegistry) Release(imagePath string) {
|
||||
|
||||
e.refcount--
|
||||
if e.refcount <= 0 {
|
||||
if err := losetupDetach(e.device); err != nil {
|
||||
slog.Warn("losetup detach failed", "device", e.device, "error", err)
|
||||
if err := losetupDetachRetry(e.device); err != nil {
|
||||
slog.Error("losetup detach failed, loop device leaked", "device", e.device, "image", imagePath, "error", err)
|
||||
}
|
||||
delete(r.entries, imagePath)
|
||||
slog.Info("loop device released", "image", imagePath, "device", e.device)
|
||||
@ -94,8 +94,8 @@ func (r *LoopRegistry) ReleaseAll() {
|
||||
defer r.mu.Unlock()
|
||||
|
||||
for path, e := range r.entries {
|
||||
if err := losetupDetach(e.device); err != nil {
|
||||
slog.Warn("losetup detach failed", "device", e.device, "error", err)
|
||||
if err := losetupDetachRetry(e.device); err != nil {
|
||||
slog.Error("losetup detach failed during shutdown", "device", e.device, "image", path, "error", err)
|
||||
}
|
||||
delete(r.entries, path)
|
||||
}
|
||||
@ -134,8 +134,8 @@ func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes, cowSiz
|
||||
// space to store all modified blocks (it's sparse, so 20GB costs nothing).
|
||||
sectors := originSizeBytes / 512
|
||||
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
|
||||
if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
|
||||
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
|
||||
if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil {
|
||||
slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr)
|
||||
}
|
||||
os.Remove(cowPath)
|
||||
return nil, fmt.Errorf("dmsetup create: %w", err)
|
||||
@ -178,8 +178,8 @@ func RestoreSnapshot(ctx context.Context, name, originLoopDev, cowPath string, o
|
||||
|
||||
sectors := originSizeBytes / 512
|
||||
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
|
||||
if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
|
||||
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
|
||||
if detachErr := losetupDetachRetry(cowLoopDev); detachErr != nil {
|
||||
slog.Error("cow losetup detach failed during cleanup, loop device leaked", "device", cowLoopDev, "error", detachErr)
|
||||
}
|
||||
return nil, fmt.Errorf("dmsetup create: %w", err)
|
||||
}
|
||||
@ -208,8 +208,8 @@ func RemoveSnapshot(ctx context.Context, dev *SnapshotDevice) error {
|
||||
return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err)
|
||||
}
|
||||
|
||||
if err := losetupDetach(dev.CowLoopDev); err != nil {
|
||||
slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err)
|
||||
if err := losetupDetachRetry(dev.CowLoopDev); err != nil {
|
||||
return fmt.Errorf("detach cow loop %s: %w", dev.CowLoopDev, err)
|
||||
}
|
||||
|
||||
slog.Info("dm-snapshot removed", "name", dev.Name)
|
||||
@ -297,6 +297,24 @@ func losetupDetach(dev string) error {
|
||||
return exec.Command("losetup", "-d", dev).Run()
|
||||
}
|
||||
|
||||
// losetupDetachRetry detaches a loop device with retries for transient
|
||||
// "device busy" errors (kernel may still hold references briefly after
|
||||
// dm-snapshot removal).
|
||||
func losetupDetachRetry(dev string) error {
|
||||
var lastErr error
|
||||
for attempt := range 5 {
|
||||
if attempt > 0 {
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if err := losetupDetach(dev); err == nil {
|
||||
return nil
|
||||
} else {
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("after 5 attempts: %w", lastErr)
|
||||
}
|
||||
|
||||
// dmsetupCreate creates a dm-snapshot device with persistent metadata.
|
||||
func dmsetupCreate(name, originDev, cowDev string, sectors int64) error {
|
||||
// Table format: <start> <size> snapshot <origin> <cow> P <chunk_size>
|
||||
@ -316,7 +334,7 @@ func dmDeviceExists(name string) bool {
|
||||
|
||||
// dmsetupRemove removes a device-mapper device, retrying on transient
|
||||
// "device busy" errors that occur when the kernel hasn't fully released
|
||||
// the device after a Firecracker process exits.
|
||||
// the device after a VMM process exits.
|
||||
func dmsetupRemove(ctx context.Context, name string) error {
|
||||
var lastErr error
|
||||
for attempt := range 5 {
|
||||
|
||||
@ -294,7 +294,7 @@ func (c *Client) ReadFile(ctx context.Context, path string) ([]byte, error) {
|
||||
|
||||
// PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which stops
|
||||
// the port scanner/forwarder and marks active connections for post-restore
|
||||
// cleanup before Firecracker freezes vCPUs.
|
||||
// cleanup before the VMM freezes vCPUs.
|
||||
//
|
||||
// Best-effort: the caller should log a warning on error but not abort the pause.
|
||||
func (c *Client) PrepareSnapshot(ctx context.Context) error {
|
||||
@ -317,27 +317,33 @@ func (c *Client) PrepareSnapshot(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// PostInit calls envd's POST /init endpoint, which triggers a re-read of
|
||||
// Firecracker MMDS metadata. This updates WRENN_SANDBOX_ID, WRENN_TEMPLATE_ID
|
||||
// env vars and the corresponding files under /run/wrenn/ inside the guest.
|
||||
// Must be called after snapshot restore so envd picks up the new sandbox's metadata.
|
||||
// PostInit calls envd's POST /init endpoint to trigger post-boot or
|
||||
// post-restore initialization. sandbox_id and template_id are passed
|
||||
// so envd can set WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID env vars.
|
||||
func (c *Client) PostInit(ctx context.Context) error {
|
||||
return c.PostInitWithDefaults(ctx, "", nil)
|
||||
return c.PostInitWithDefaults(ctx, "", nil, "", "")
|
||||
}
|
||||
|
||||
// PostInitWithDefaults calls envd's POST /init endpoint with optional default
|
||||
// user and environment variables. These are applied to envd's defaults so all
|
||||
// subsequent process executions use them.
|
||||
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string) error {
|
||||
// user, environment variables, and sandbox metadata. These are applied to
|
||||
// envd's defaults so all subsequent process executions use them.
|
||||
func (c *Client) PostInitWithDefaults(ctx context.Context, defaultUser string, envVars map[string]string, sandboxID, templateID string) error {
|
||||
payload := make(map[string]any)
|
||||
if defaultUser != "" {
|
||||
payload["defaultUser"] = defaultUser
|
||||
}
|
||||
if len(envVars) > 0 {
|
||||
payload["envVars"] = envVars
|
||||
}
|
||||
if sandboxID != "" {
|
||||
payload["sandbox_id"] = sandboxID
|
||||
}
|
||||
if templateID != "" {
|
||||
payload["template_id"] = templateID
|
||||
}
|
||||
|
||||
var body io.Reader
|
||||
if defaultUser != "" || len(envVars) > 0 {
|
||||
payload := make(map[string]any)
|
||||
if defaultUser != "" {
|
||||
payload["defaultUser"] = defaultUser
|
||||
}
|
||||
if len(envVars) > 0 {
|
||||
payload["envVars"] = envVars
|
||||
}
|
||||
if len(payload) > 0 {
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal init body: %w", err)
|
||||
|
||||
@ -46,7 +46,7 @@ func SandboxesDir(wrennDir string) string {
|
||||
return filepath.Join(wrennDir, "sandboxes")
|
||||
}
|
||||
|
||||
// KernelPath returns the path to the Firecracker kernel.
|
||||
// KernelPath returns the path to the VM kernel.
|
||||
func KernelPath(wrennDir string) string {
|
||||
return filepath.Join(wrennDir, "kernels", "vmlinux")
|
||||
}
|
||||
|
||||
@ -176,7 +176,7 @@ func NewSlot(index int) *Slot {
|
||||
// CreateNetwork sets up the full network topology for a sandbox:
|
||||
// - Named network namespace
|
||||
// - Veth pair bridging host and namespace
|
||||
// - TAP device inside namespace for Firecracker
|
||||
// - TAP device inside namespace for Cloud Hypervisor
|
||||
// - Routes and NAT rules for connectivity
|
||||
//
|
||||
// On error, all partially created resources are rolled back.
|
||||
|
||||
28
internal/sandbox/chversion.go
Normal file
28
internal/sandbox/chversion.go
Normal file
@ -0,0 +1,28 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DetectCHVersion runs the cloud-hypervisor binary with --version and
|
||||
// parses the semver from the output (e.g. "cloud-hypervisor v43.0" → "43.0").
|
||||
func DetectCHVersion(binaryPath string) (string, error) {
|
||||
out, err := exec.Command(binaryPath, "--version").Output()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
|
||||
}
|
||||
|
||||
line := strings.TrimSpace(string(out))
|
||||
for field := range strings.FieldsSeq(line) {
|
||||
v := strings.TrimPrefix(field, "v")
|
||||
if v != field || strings.Contains(field, ".") {
|
||||
if strings.Count(v, ".") >= 1 {
|
||||
return v, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("could not parse version from cloud-hypervisor output: %q", line)
|
||||
}
|
||||
@ -1,30 +0,0 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// DetectFirecrackerVersion runs the firecracker binary with --version and
// parses the semver from the output (e.g. "Firecracker v1.14.1" → "1.14.1").
func DetectFirecrackerVersion(binaryPath string) (string, error) {
	out, err := exec.Command(binaryPath, "--version").Output()
	if err != nil {
		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
	}

	// Output is typically "Firecracker v1.14.1\n" or similar. Take the first
	// whitespace-separated field that is "v"-prefixed or dotted and still
	// contains a dot after the optional "v" is stripped.
	line := strings.TrimSpace(string(out))
	for _, field := range strings.Fields(line) {
		trimmed := strings.TrimPrefix(field, "v")
		looksVersioned := trimmed != field || strings.Contains(field, ".")
		if looksVersioned && strings.Count(trimmed, ".") >= 1 {
			return trimmed, nil
		}
	}

	return "", fmt.Errorf("could not parse version from firecracker output: %q", line)
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,9 +1,11 @@
|
||||
package sandbox
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -50,10 +52,15 @@ func readCPUStat(pid int) (cpuStat, error) {
|
||||
|
||||
// readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns
|
||||
// guest-side total - MemAvailable (actual process memory, excluding reclaimable
|
||||
// page cache). VmRSS of the Firecracker process includes guest page cache and
|
||||
// page cache). VmRSS of the VMM process includes guest page cache and
|
||||
// never decreases, so this is the accurate metric for dashboard display.
|
||||
func readEnvdMemUsed(client *envdclient.Client) (int64, error) {
|
||||
resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics")
|
||||
func readEnvdMemUsed(ctx context.Context, client *envdclient.Client) (int64, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, client.BaseURL()+"/metrics", nil)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("build metrics request: %w", err)
|
||||
}
|
||||
|
||||
resp, err := client.HTTPClient().Do(req)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("fetch envd metrics: %w", err)
|
||||
}
|
||||
|
||||
@ -1,221 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
|
||||
// Package snapshot implements snapshot storage, header-based memory mapping,
|
||||
// and memory file processing for Firecracker VM snapshots.
|
||||
//
|
||||
// The header system implements a generational copy-on-write memory mapping.
|
||||
// Each snapshot generation stores only the blocks that changed since the
|
||||
// previous generation. A Header contains a sorted list of BuildMap entries
|
||||
// that together cover the entire memory address space, with each entry
|
||||
// pointing to a specific generation's diff file.
|
||||
package snapshot
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
const metadataVersion = 1
|
||||
|
||||
// Metadata is the fixed-size header prefix describing the snapshot memory layout.
|
||||
// Binary layout (little-endian, 64 bytes total):
|
||||
//
|
||||
// Version uint64 (8 bytes)
|
||||
// BlockSize uint64 (8 bytes)
|
||||
// Size uint64 (8 bytes) — total memory size in bytes
|
||||
// Generation uint64 (8 bytes)
|
||||
// BuildID [16]byte (UUID)
|
||||
// BaseBuildID [16]byte (UUID)
|
||||
type Metadata struct {
|
||||
Version uint64
|
||||
BlockSize uint64
|
||||
Size uint64
|
||||
Generation uint64
|
||||
BuildID uuid.UUID
|
||||
BaseBuildID uuid.UUID
|
||||
}
|
||||
|
||||
// NewMetadata creates metadata for a first-generation snapshot.
// Generation starts at 0 and BaseBuildID equals BuildID, marking this
// build as the root of its generation chain.
func NewMetadata(buildID uuid.UUID, blockSize, size uint64) *Metadata {
	return &Metadata{
		Version:     metadataVersion,
		Generation:  0,
		BlockSize:   blockSize,
		Size:        size,
		BuildID:     buildID,
		BaseBuildID: buildID,
	}
}
|
||||
|
||||
// NextGeneration creates metadata for the next generation in the chain.
// Version, BlockSize, Size, and BaseBuildID are inherited from the receiver;
// only the new BuildID and the incremented Generation distinguish the entry.
func (m *Metadata) NextGeneration(buildID uuid.UUID) *Metadata {
	return &Metadata{
		Version:     m.Version,
		Generation:  m.Generation + 1,
		BlockSize:   m.BlockSize,
		Size:        m.Size,
		BuildID:     buildID,
		BaseBuildID: m.BaseBuildID,
	}
}
|
||||
|
||||
// BuildMap maps a contiguous range of the memory address space to a specific
|
||||
// generation's diff file. Binary layout (little-endian, 40 bytes):
|
||||
//
|
||||
// Offset uint64 — byte offset in the virtual address space
|
||||
// Length uint64 — byte count (multiple of BlockSize)
|
||||
// BuildID [16]byte — which generation's diff file, uuid.Nil = zero-fill
|
||||
// BuildStorageOffset uint64 — byte offset within that generation's diff file
|
||||
type BuildMap struct {
|
||||
Offset uint64
|
||||
Length uint64
|
||||
BuildID uuid.UUID
|
||||
BuildStorageOffset uint64
|
||||
}
|
||||
|
||||
// Header is the in-memory representation of a snapshot's memory mapping.
|
||||
// It provides O(log N) lookup from any memory offset to the correct
|
||||
// generation's diff file and offset within it.
|
||||
type Header struct {
|
||||
Metadata *Metadata
|
||||
Mapping []*BuildMap
|
||||
|
||||
// blockStarts tracks which block indices start a new BuildMap entry.
|
||||
// startMap provides direct access from block index to the BuildMap.
|
||||
blockStarts []bool
|
||||
startMap map[int64]*BuildMap
|
||||
}
|
||||
|
||||
// NewHeader creates a Header from metadata and mapping entries.
// If mapping is nil/empty, a single entry covering the full size is created,
// i.e. the whole address space is backed by this build's own diff file.
func NewHeader(metadata *Metadata, mapping []*BuildMap) (*Header, error) {
	// BlockSize is used as a divisor below; zero would panic, so reject it.
	if metadata.BlockSize == 0 {
		return nil, fmt.Errorf("block size cannot be zero")
	}

	if len(mapping) == 0 {
		mapping = []*BuildMap{{
			Offset:             0,
			Length:             metadata.Size,
			BuildID:            metadata.BuildID,
			BuildStorageOffset: 0,
		}}
	}

	// Precompute the lookup structures used by GetShiftedMapping: a bitmap of
	// block indices where a mapping entry starts, plus index → entry.
	blocks := TotalBlocks(int64(metadata.Size), int64(metadata.BlockSize))
	starts := make([]bool, blocks)
	startMap := make(map[int64]*BuildMap, len(mapping))

	for _, m := range mapping {
		idx := BlockIdx(int64(m.Offset), int64(metadata.BlockSize))
		// Entries whose offset falls outside the address space are silently
		// skipped rather than rejected.
		if idx >= 0 && idx < blocks {
			starts[idx] = true
			startMap[idx] = m
		}
	}

	return &Header{
		Metadata:    metadata,
		Mapping:     mapping,
		blockStarts: starts,
		startMap:    startMap,
	}, nil
}
|
||||
|
||||
// GetShiftedMapping resolves a memory offset to the corresponding diff file
// offset, remaining length, and build ID. This is the hot path called for
// every UFFD page fault.
func (h *Header) GetShiftedMapping(_ context.Context, offset int64) (mappedOffset int64, mappedLength int64, buildID *uuid.UUID, err error) {
	// Reject offsets outside the snapshot's address space up front.
	if offset < 0 || offset >= int64(h.Metadata.Size) {
		return 0, 0, nil, fmt.Errorf("offset %d out of bounds (size: %d)", offset, h.Metadata.Size)
	}

	blockSize := int64(h.Metadata.BlockSize)
	block := BlockIdx(offset, blockSize)

	// Walk backwards to find the BuildMap that contains this block.
	// blockStarts flags each block index where a mapping entry begins, so the
	// first flagged index at or below `block` identifies the owning entry.
	start := block
	for start >= 0 {
		if h.blockStarts[start] {
			break
		}
		start--
	}
	if start < 0 {
		return 0, 0, nil, fmt.Errorf("no mapping found for offset %d", offset)
	}

	// startMap should hold every index flagged in blockStarts; a miss here
	// means the header is internally inconsistent.
	m, ok := h.startMap[start]
	if !ok {
		return 0, 0, nil, fmt.Errorf("no mapping at block %d", start)
	}

	// shift is the block-aligned byte distance from the mapping's first block
	// to the requested block.
	shift := (block - start) * blockSize
	if shift >= int64(m.Length) {
		return 0, 0, nil, fmt.Errorf("offset %d beyond mapping end (mapping offset=%d, length=%d)", offset, m.Offset, m.Length)
	}

	// Translate into the diff file: the entry's storage offset plus the
	// intra-mapping shift; remaining length runs to the end of the entry.
	return int64(m.BuildStorageOffset) + shift, int64(m.Length) - shift, &m.BuildID, nil
}
|
||||
|
||||
// Serialize writes metadata + mapping entries to binary (little-endian).
// The result is the on-disk header format: the fixed-size Metadata struct
// followed by zero or more BuildMap entries, which Deserialize reads back.
func Serialize(metadata *Metadata, mappings []*BuildMap) ([]byte, error) {
	var buf bytes.Buffer

	if err := binary.Write(&buf, binary.LittleEndian, metadata); err != nil {
		return nil, fmt.Errorf("write metadata: %w", err)
	}

	// Mapping entries are appended in order, with no count or delimiter;
	// the reader consumes entries until EOF.
	for _, m := range mappings {
		if err := binary.Write(&buf, binary.LittleEndian, m); err != nil {
			return nil, fmt.Errorf("write mapping: %w", err)
		}
	}

	return buf.Bytes(), nil
}
|
||||
|
||||
// Deserialize reads a header from binary data.
// It parses the fixed-size Metadata prefix, then reads BuildMap entries
// until EOF, and builds (and thereby validates) the Header via NewHeader.
func Deserialize(data []byte) (*Header, error) {
	reader := bytes.NewReader(data)

	var metadata Metadata
	if err := binary.Read(reader, binary.LittleEndian, &metadata); err != nil {
		return nil, fmt.Errorf("read metadata: %w", err)
	}

	var mappings []*BuildMap
	for {
		var m BuildMap
		if err := binary.Read(reader, binary.LittleEndian, &m); err != nil {
			// The entry list carries no count — a clean EOF terminates it;
			// any other error (including a truncated entry) is corruption.
			if errors.Is(err, io.EOF) {
				break
			}
			return nil, fmt.Errorf("read mapping: %w", err)
		}
		mappings = append(mappings, &m)
	}

	return NewHeader(&metadata, mappings)
}
|
||||
|
||||
// Block index helpers.
|
||||
|
||||
// TotalBlocks returns how many blockSize-byte blocks are needed to cover
// size bytes; a trailing partial block counts as a whole block.
func TotalBlocks(size, blockSize int64) int64 {
	roundUp := blockSize - 1
	return (size + roundUp) / blockSize
}
|
||||
|
||||
// BlockIdx returns the index of the block containing the given byte offset.
func BlockIdx(offset, blockSize int64) int64 {
	idx := offset / blockSize
	return idx
}
|
||||
|
||||
// BlockOffset returns the byte offset at which block idx begins.
func BlockOffset(idx, blockSize int64) int64 {
	off := idx * blockSize
	return off
}
|
||||
@ -7,14 +7,15 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
const (
|
||||
SnapFileName = "snapfile"
|
||||
MemDiffName = "memfile"
|
||||
MemHeaderName = "memfile.header"
|
||||
// Cloud Hypervisor snapshot files.
|
||||
CHConfigFile = "config.json"
|
||||
CHMemRangesFile = "memory-ranges"
|
||||
CHStateFile = "state.json"
|
||||
|
||||
// Rootfs files.
|
||||
RootfsFileName = "rootfs.ext4"
|
||||
RootfsCowName = "rootfs.cow"
|
||||
RootfsMetaName = "rootfs.meta"
|
||||
@ -25,27 +26,6 @@ func DirPath(baseDir, name string) string {
|
||||
return filepath.Join(baseDir, name)
|
||||
}
|
||||
|
||||
// SnapPath returns the path to the VM state snapshot file.
|
||||
func SnapPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), SnapFileName)
|
||||
}
|
||||
|
||||
// MemDiffPath returns the path to the compact memory diff file (legacy single-generation).
|
||||
func MemDiffPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), MemDiffName)
|
||||
}
|
||||
|
||||
// MemDiffPathForBuild returns the path to a specific generation's diff file.
|
||||
// Format: memfile.{buildID}
|
||||
func MemDiffPathForBuild(baseDir, name string, buildID uuid.UUID) string {
|
||||
return filepath.Join(DirPath(baseDir, name), fmt.Sprintf("memfile.%s", buildID.String()))
|
||||
}
|
||||
|
||||
// MemHeaderPath returns the path to the memory mapping header file.
|
||||
func MemHeaderPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), MemHeaderName)
|
||||
}
|
||||
|
||||
// RootfsPath returns the path to the rootfs image.
|
||||
func RootfsPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), RootfsFileName)
|
||||
@ -61,10 +41,13 @@ func MetaPath(baseDir, name string) string {
|
||||
return filepath.Join(DirPath(baseDir, name), RootfsMetaName)
|
||||
}
|
||||
|
||||
// RootfsMeta records which base template a CoW file was created against.
|
||||
// RootfsMeta records which base template a CoW file was created against
|
||||
// and the VM resource config needed to restart the sampler on resume.
|
||||
type RootfsMeta struct {
|
||||
BaseTemplate string `json:"base_template"`
|
||||
TemplateID string `json:"template_id,omitempty"`
|
||||
VCPUs int `json:"vcpus,omitempty"`
|
||||
MemoryMB int `json:"memory_mb,omitempty"`
|
||||
}
|
||||
|
||||
// WriteMeta writes rootfs metadata to the snapshot directory.
|
||||
@ -92,102 +75,6 @@ func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
|
||||
return &meta, nil
|
||||
}
|
||||
|
||||
// Exists reports whether a complete snapshot exists (all required files present).
|
||||
// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
|
||||
// Memory diff files can be either legacy "memfile" or generation-specific "memfile.{uuid}".
|
||||
func Exists(baseDir, name string) bool {
|
||||
dir := DirPath(baseDir, name)
|
||||
|
||||
// snapfile and header are always required.
|
||||
for _, f := range []string{SnapFileName, MemHeaderName} {
|
||||
if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Check that at least one memfile exists (legacy or generation-specific).
|
||||
// We verify by reading the header and checking that referenced diff files exist.
|
||||
// Fall back to checking for the legacy memfile name if header can't be read.
|
||||
if _, err := os.Stat(filepath.Join(dir, MemDiffName)); err != nil {
|
||||
// No legacy memfile — check if any memfile.{uuid} exists by
|
||||
// looking for files matching the pattern.
|
||||
matches, _ := filepath.Glob(filepath.Join(dir, "memfile.*"))
|
||||
hasGenDiff := false
|
||||
for _, m := range matches {
|
||||
base := filepath.Base(m)
|
||||
if base != MemHeaderName {
|
||||
hasGenDiff = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasGenDiff {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// Accept either rootfs.ext4 (legacy/template) or rootfs.cow + rootfs.meta (dm-snapshot).
|
||||
if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil {
|
||||
return true
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(dir, RootfsCowName)); err == nil {
|
||||
if _, err := os.Stat(filepath.Join(dir, RootfsMetaName)); err == nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// IsTemplate reports whether a template image directory exists (has rootfs.ext4).
|
||||
func IsTemplate(baseDir, name string) bool {
|
||||
_, err := os.Stat(filepath.Join(DirPath(baseDir, name), RootfsFileName))
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// IsSnapshot reports whether a directory is a snapshot (has all snapshot files).
|
||||
func IsSnapshot(baseDir, name string) bool {
|
||||
return Exists(baseDir, name)
|
||||
}
|
||||
|
||||
// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta)
|
||||
// as opposed to legacy full rootfs (rootfs.ext4).
|
||||
func HasCow(baseDir, name string) bool {
|
||||
dir := DirPath(baseDir, name)
|
||||
_, cowErr := os.Stat(filepath.Join(dir, RootfsCowName))
|
||||
_, metaErr := os.Stat(filepath.Join(dir, RootfsMetaName))
|
||||
return cowErr == nil && metaErr == nil
|
||||
}
|
||||
|
||||
// ListDiffFiles returns a map of build ID → file path for all memory diff files
|
||||
// referenced by the given header. Handles both the legacy "memfile" name
|
||||
// (single-generation) and generation-specific "memfile.{uuid}" names.
|
||||
func ListDiffFiles(baseDir, name string, header *Header) (map[string]string, error) {
|
||||
dir := DirPath(baseDir, name)
|
||||
result := make(map[string]string)
|
||||
|
||||
for _, m := range header.Mapping {
|
||||
if m.BuildID == uuid.Nil {
|
||||
continue // zero-fill, no file needed
|
||||
}
|
||||
idStr := m.BuildID.String()
|
||||
if _, exists := result[idStr]; exists {
|
||||
continue
|
||||
}
|
||||
// Try generation-specific path first, fall back to legacy.
|
||||
genPath := filepath.Join(dir, fmt.Sprintf("memfile.%s", idStr))
|
||||
if _, err := os.Stat(genPath); err == nil {
|
||||
result[idStr] = genPath
|
||||
continue
|
||||
}
|
||||
legacyPath := filepath.Join(dir, MemDiffName)
|
||||
if _, err := os.Stat(legacyPath); err == nil {
|
||||
result[idStr] = legacyPath
|
||||
continue
|
||||
}
|
||||
return nil, fmt.Errorf("diff file not found for build %s", idStr)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// EnsureDir creates the snapshot directory if it doesn't exist.
|
||||
func EnsureDir(baseDir, name string) error {
|
||||
dir := DirPath(baseDir, name)
|
||||
|
||||
@ -1,214 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
|
||||
package snapshot
|
||||
|
||||
import "github.com/google/uuid"
|
||||
|
||||
// CreateMapping converts a dirty-block bitset (represented as a []bool) into
|
||||
// a sorted list of BuildMap entries. Consecutive dirty blocks are merged into
|
||||
// a single entry. BuildStorageOffset tracks the sequential position in the
|
||||
// compact diff file.
|
||||
func CreateMapping(buildID uuid.UUID, dirty []bool, blockSize int64) []*BuildMap {
|
||||
var mappings []*BuildMap
|
||||
var runStart int64 = -1
|
||||
var runLength int64
|
||||
var storageOffset uint64
|
||||
|
||||
for i, set := range dirty {
|
||||
if !set {
|
||||
if runLength > 0 {
|
||||
mappings = append(mappings, &BuildMap{
|
||||
Offset: uint64(runStart) * uint64(blockSize),
|
||||
Length: uint64(runLength) * uint64(blockSize),
|
||||
BuildID: buildID,
|
||||
BuildStorageOffset: storageOffset,
|
||||
})
|
||||
storageOffset += uint64(runLength) * uint64(blockSize)
|
||||
runLength = 0
|
||||
}
|
||||
runStart = -1
|
||||
continue
|
||||
}
|
||||
|
||||
if runStart < 0 {
|
||||
runStart = int64(i)
|
||||
runLength = 1
|
||||
} else {
|
||||
runLength++
|
||||
}
|
||||
}
|
||||
|
||||
if runLength > 0 {
|
||||
mappings = append(mappings, &BuildMap{
|
||||
Offset: uint64(runStart) * uint64(blockSize),
|
||||
Length: uint64(runLength) * uint64(blockSize),
|
||||
BuildID: buildID,
|
||||
BuildStorageOffset: storageOffset,
|
||||
})
|
||||
}
|
||||
|
||||
return mappings
|
||||
}
|
||||
|
||||
// MergeMappings overlays diffMapping on top of baseMapping. Where they overlap,
// diff takes priority. The result covers the entire address space.
//
// Both inputs must be sorted by Offset. The base mapping should cover the full size.
//
// Inspired by e2b's snapshot system (Apache 2.0, modified by Omukk).
func MergeMappings(baseMapping, diffMapping []*BuildMap) []*BuildMap {
	if len(diffMapping) == 0 {
		return baseMapping
	}

	// Work on a copy of baseMapping to avoid mutating the original.
	// Entries in the copy are replaced in place when trimmed below.
	baseCopy := make([]*BuildMap, len(baseMapping))
	for i, m := range baseMapping {
		cp := *m
		baseCopy[i] = &cp
	}

	var result []*BuildMap
	var bi, di int

	// Two-pointer sweep over both sorted lists. Each iteration matches
	// exactly one of the cases below (empty entry, disjoint either way,
	// containment either way, or partial overlap either way) and advances
	// at least one pointer, so the loop terminates.
	for bi < len(baseCopy) && di < len(diffMapping) {
		base := baseCopy[bi]
		diff := diffMapping[di]

		if base.Length == 0 {
			bi++
			continue
		}
		if diff.Length == 0 {
			di++
			continue
		}

		// No overlap: base entirely before diff.
		if base.Offset+base.Length <= diff.Offset {
			result = append(result, base)
			bi++
			continue
		}

		// No overlap: diff entirely before base.
		if diff.Offset+diff.Length <= base.Offset {
			result = append(result, diff)
			di++
			continue
		}

		// Base fully inside diff — skip base (diff wins on overlap).
		if base.Offset >= diff.Offset && base.Offset+base.Length <= diff.Offset+diff.Length {
			bi++
			continue
		}

		// Diff fully inside base — split base around diff.
		if diff.Offset >= base.Offset && diff.Offset+diff.Length <= base.Offset+base.Length {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}

			result = append(result, diff)
			di++

			// Trim base in place to its remainder past diff's end.
			// BuildStorageOffset is shifted by the same amount so that
			// offset→storage lookups into the trimmed entry stay correct.
			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift

			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}

		// Base starts after diff with overlap — emit diff, trim base.
		if base.Offset > diff.Offset {
			result = append(result, diff)
			di++

			rightShift := int64(diff.Offset) + int64(diff.Length) - int64(base.Offset)
			rightLen := int64(base.Length) - rightShift

			if rightLen > 0 {
				baseCopy[bi] = &BuildMap{
					Offset:             base.Offset + uint64(rightShift),
					Length:             uint64(rightLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset + uint64(rightShift),
				}
			} else {
				bi++
			}
			continue
		}

		// Diff starts after base with overlap — emit left part of base.
		// The overlapping tail of base ends strictly inside diff (otherwise
		// the containment case above would have matched), so it is fully
		// covered by diff and can be dropped.
		if diff.Offset > base.Offset {
			leftLen := int64(diff.Offset) - int64(base.Offset)
			if leftLen > 0 {
				result = append(result, &BuildMap{
					Offset:             base.Offset,
					Length:             uint64(leftLen),
					BuildID:            base.BuildID,
					BuildStorageOffset: base.BuildStorageOffset,
				})
			}
			bi++
			continue
		}
	}

	// Append remaining entries.
	result = append(result, baseCopy[bi:]...)
	result = append(result, diffMapping[di:]...)

	return result
}
|
||||
|
||||
// NormalizeMappings merges adjacent entries with the same BuildID.
|
||||
func NormalizeMappings(mappings []*BuildMap) []*BuildMap {
|
||||
if len(mappings) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
result := make([]*BuildMap, 0, len(mappings))
|
||||
current := &BuildMap{
|
||||
Offset: mappings[0].Offset,
|
||||
Length: mappings[0].Length,
|
||||
BuildID: mappings[0].BuildID,
|
||||
BuildStorageOffset: mappings[0].BuildStorageOffset,
|
||||
}
|
||||
|
||||
for i := 1; i < len(mappings); i++ {
|
||||
m := mappings[i]
|
||||
if m.BuildID == current.BuildID {
|
||||
current.Length += m.Length
|
||||
} else {
|
||||
result = append(result, current)
|
||||
current = &BuildMap{
|
||||
Offset: m.Offset,
|
||||
Length: m.Length,
|
||||
BuildID: m.BuildID,
|
||||
BuildStorageOffset: m.BuildStorageOffset,
|
||||
}
|
||||
}
|
||||
}
|
||||
result = append(result, current)
|
||||
|
||||
return result
|
||||
}
|
||||
@ -1,285 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
|
||||
package snapshot
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
const (
	// DefaultBlockSize is 4KB — standard page size for Firecracker.
	// Diff files and dirty-block bookkeeping all operate at this granularity.
	DefaultBlockSize int64 = 4096
)
|
||||
|
||||
// ProcessMemfile reads a full memory file produced by Firecracker's
|
||||
// PUT /snapshot/create, identifies non-zero blocks, and writes only those
|
||||
// blocks to a compact diff file. Returns the Header describing the mapping.
|
||||
//
|
||||
// The output diff file contains non-zero blocks written sequentially.
|
||||
// The header maps each block in the full address space to either:
|
||||
// - A position in the diff file (for non-zero blocks)
|
||||
// - uuid.Nil (for zero/empty blocks, served as zeros without I/O)
|
||||
//
|
||||
// buildID identifies this snapshot generation in the header chain.
|
||||
func ProcessMemfile(memfilePath, diffPath, headerPath string, buildID uuid.UUID) (*Header, error) {
|
||||
src, err := os.Open(memfilePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open memfile: %w", err)
|
||||
}
|
||||
defer src.Close()
|
||||
|
||||
info, err := src.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stat memfile: %w", err)
|
||||
}
|
||||
memSize := info.Size()
|
||||
|
||||
dst, err := os.Create(diffPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create diff file: %w", err)
|
||||
}
|
||||
defer dst.Close()
|
||||
|
||||
totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
|
||||
dirty := make([]bool, totalBlocks)
|
||||
empty := make([]bool, totalBlocks)
|
||||
buf := make([]byte, DefaultBlockSize)
|
||||
|
||||
for i := int64(0); i < totalBlocks; i++ {
|
||||
n, err := io.ReadFull(src, buf)
|
||||
if err != nil && err != io.ErrUnexpectedEOF {
|
||||
return nil, fmt.Errorf("read block %d: %w", i, err)
|
||||
}
|
||||
|
||||
// Zero-pad the last block if it's short.
|
||||
if int64(n) < DefaultBlockSize {
|
||||
for j := n; j < int(DefaultBlockSize); j++ {
|
||||
buf[j] = 0
|
||||
}
|
||||
}
|
||||
|
||||
if isZeroBlock(buf) {
|
||||
empty[i] = true
|
||||
continue
|
||||
}
|
||||
|
||||
dirty[i] = true
|
||||
if _, err := dst.Write(buf); err != nil {
|
||||
return nil, fmt.Errorf("write diff block %d: %w", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Build header.
|
||||
dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
|
||||
emptyMappings := CreateMapping(uuid.Nil, empty, DefaultBlockSize)
|
||||
merged := MergeMappings(dirtyMappings, emptyMappings)
|
||||
normalized := NormalizeMappings(merged)
|
||||
|
||||
metadata := NewMetadata(buildID, uint64(DefaultBlockSize), uint64(memSize))
|
||||
header, err := NewHeader(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create header: %w", err)
|
||||
}
|
||||
|
||||
// Write header to disk.
|
||||
headerData, err := Serialize(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("serialize header: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
|
||||
return nil, fmt.Errorf("write header: %w", err)
|
||||
}
|
||||
|
||||
return header, nil
|
||||
}
|
||||
|
||||
// ProcessMemfileWithParent processes a memory file as a new generation on top
|
||||
// of an existing parent header. The new diff file contains only blocks that
|
||||
// differ from what the parent header maps. This is used for re-pause of a
|
||||
// sandbox that was restored from a snapshot.
|
||||
func ProcessMemfileWithParent(memfilePath, diffPath, headerPath string, parentHeader *Header, buildID uuid.UUID) (*Header, error) {
|
||||
src, err := os.Open(memfilePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open memfile: %w", err)
|
||||
}
|
||||
defer src.Close()
|
||||
|
||||
info, err := src.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stat memfile: %w", err)
|
||||
}
|
||||
memSize := info.Size()
|
||||
|
||||
dst, err := os.Create(diffPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create diff file: %w", err)
|
||||
}
|
||||
defer dst.Close()
|
||||
|
||||
totalBlocks := TotalBlocks(memSize, DefaultBlockSize)
|
||||
dirty := make([]bool, totalBlocks)
|
||||
buf := make([]byte, DefaultBlockSize)
|
||||
|
||||
for i := int64(0); i < totalBlocks; i++ {
|
||||
n, err := io.ReadFull(src, buf)
|
||||
if err != nil && err != io.ErrUnexpectedEOF {
|
||||
return nil, fmt.Errorf("read block %d: %w", i, err)
|
||||
}
|
||||
|
||||
if int64(n) < DefaultBlockSize {
|
||||
for j := n; j < int(DefaultBlockSize); j++ {
|
||||
buf[j] = 0
|
||||
}
|
||||
}
|
||||
|
||||
if isZeroBlock(buf) {
|
||||
// For a diff memfile, zero blocks mean "not dirtied since resume" —
|
||||
// they should inherit the parent's mapping, not be zero-filled.
|
||||
continue
|
||||
}
|
||||
|
||||
dirty[i] = true
|
||||
if _, err := dst.Write(buf); err != nil {
|
||||
return nil, fmt.Errorf("write diff block %d: %w", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Only dirty blocks go into the diff overlay; MergeMappings preserves the
|
||||
// parent's mapping for everything else.
|
||||
dirtyMappings := CreateMapping(buildID, dirty, DefaultBlockSize)
|
||||
merged := MergeMappings(parentHeader.Mapping, dirtyMappings)
|
||||
normalized := NormalizeMappings(merged)
|
||||
|
||||
metadata := parentHeader.Metadata.NextGeneration(buildID)
|
||||
header, err := NewHeader(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create header: %w", err)
|
||||
}
|
||||
|
||||
headerData, err := Serialize(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("serialize header: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
|
||||
return nil, fmt.Errorf("write header: %w", err)
|
||||
}
|
||||
|
||||
return header, nil
|
||||
}
|
||||
|
||||
// MergeDiffs consolidates multiple generation diff files into a single diff
|
||||
// file and resets the generation counter to 0. This is a pure file-level
|
||||
// operation — no Firecracker involvement.
|
||||
//
|
||||
// It reads each non-nil block from the appropriate diff file (as mapped by
|
||||
// the header), writes them all sequentially into a single new diff file,
|
||||
// and produces a fresh header pointing only at that file.
|
||||
//
|
||||
// diffFiles maps build ID (string) → open file path for each generation's diff.
|
||||
func MergeDiffs(header *Header, diffFiles map[string]string, mergedDiffPath, headerPath string) (*Header, error) {
|
||||
blockSize := int64(header.Metadata.BlockSize)
|
||||
mergedBuildID := uuid.New()
|
||||
|
||||
// Open all source diff files.
|
||||
sources := make(map[string]*os.File, len(diffFiles))
|
||||
for id, path := range diffFiles {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
// Close already opened files.
|
||||
for _, sf := range sources {
|
||||
sf.Close()
|
||||
}
|
||||
return nil, fmt.Errorf("open diff file for build %s: %w", id, err)
|
||||
}
|
||||
sources[id] = f
|
||||
}
|
||||
defer func() {
|
||||
for _, f := range sources {
|
||||
f.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
dst, err := os.Create(mergedDiffPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create merged diff file: %w", err)
|
||||
}
|
||||
defer dst.Close()
|
||||
|
||||
totalBlocks := TotalBlocks(int64(header.Metadata.Size), blockSize)
|
||||
dirty := make([]bool, totalBlocks)
|
||||
empty := make([]bool, totalBlocks)
|
||||
buf := make([]byte, blockSize)
|
||||
|
||||
for i := int64(0); i < totalBlocks; i++ {
|
||||
offset := i * blockSize
|
||||
mappedOffset, _, buildID, err := header.GetShiftedMapping(context.Background(), offset)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lookup block %d: %w", i, err)
|
||||
}
|
||||
|
||||
if *buildID == uuid.Nil {
|
||||
empty[i] = true
|
||||
continue
|
||||
}
|
||||
|
||||
src, ok := sources[buildID.String()]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no diff file for build %s (block %d)", buildID, i)
|
||||
}
|
||||
|
||||
if _, err := src.ReadAt(buf, mappedOffset); err != nil {
|
||||
return nil, fmt.Errorf("read block %d from build %s: %w", i, buildID, err)
|
||||
}
|
||||
|
||||
dirty[i] = true
|
||||
if _, err := dst.Write(buf); err != nil {
|
||||
return nil, fmt.Errorf("write merged block %d: %w", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Build fresh header with generation 0.
|
||||
dirtyMappings := CreateMapping(mergedBuildID, dirty, blockSize)
|
||||
emptyMappings := CreateMapping(uuid.Nil, empty, blockSize)
|
||||
merged := MergeMappings(dirtyMappings, emptyMappings)
|
||||
normalized := NormalizeMappings(merged)
|
||||
|
||||
metadata := NewMetadata(mergedBuildID, uint64(blockSize), header.Metadata.Size)
|
||||
newHeader, err := NewHeader(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create merged header: %w", err)
|
||||
}
|
||||
|
||||
headerData, err := Serialize(metadata, normalized)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("serialize merged header: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(headerPath, headerData, 0644); err != nil {
|
||||
return nil, fmt.Errorf("write merged header: %w", err)
|
||||
}
|
||||
|
||||
return newHeader, nil
|
||||
}
|
||||
|
||||
// isZeroBlock reports whether a block consists entirely of zero bytes.
func isZeroBlock(block []byte) bool {
	// Fast path: OR eight bytes at a time and test the result once.
	i := 0
	for ; i+8 <= len(block); i += 8 {
		if block[i]|block[i+1]|block[i+2]|block[i+3]|
			block[i+4]|block[i+5]|block[i+6]|block[i+7] != 0 {
			return false
		}
	}
	// Remaining tail (fewer than 8 bytes).
	for ; i < len(block); i++ {
		if block[i] != 0 {
			return false
		}
	}
	return true
}
|
||||
@ -1,92 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
|
||||
// Package uffd implements a userfaultfd-based memory server for Firecracker
|
||||
// snapshot restore. When a VM is restored from a snapshot, instead of loading
|
||||
// the entire memory file upfront, the UFFD handler intercepts page faults
|
||||
// and serves memory pages on demand from the snapshot's compact diff file.
|
||||
package uffd
|
||||
|
||||
/*
|
||||
#include <sys/syscall.h>
|
||||
#include <fcntl.h>
|
||||
#include <linux/userfaultfd.h>
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
struct uffd_pagefault {
|
||||
__u64 flags;
|
||||
__u64 address;
|
||||
__u32 ptid;
|
||||
};
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Event and ioctl constants re-exported from <linux/userfaultfd.h> so the
// rest of the package can avoid cgo syntax at use sites.
const (
	UFFD_EVENT_PAGEFAULT = C.UFFD_EVENT_PAGEFAULT
	UFFD_EVENT_FORK = C.UFFD_EVENT_FORK
	UFFD_EVENT_REMAP = C.UFFD_EVENT_REMAP
	UFFD_EVENT_REMOVE = C.UFFD_EVENT_REMOVE
	UFFD_EVENT_UNMAP = C.UFFD_EVENT_UNMAP
	UFFD_PAGEFAULT_FLAG_WRITE = C.UFFD_PAGEFAULT_FLAG_WRITE
	UFFDIO_COPY = C.UFFDIO_COPY
	UFFDIO_COPY_MODE_WP = C.UFFDIO_COPY_MODE_WP
)

// Go-side aliases for the kernel structs used by the poll loop and the
// UFFDIO_COPY ioctl; uffd_pagefault is the local struct declared in the
// cgo preamble above.
type (
	uffdMsg = C.struct_uffd_msg
	uffdPagefault = C.struct_uffd_pagefault
	uffdioCopy = C.struct_uffdio_copy
)

// fd wraps a userfaultfd file descriptor received from Firecracker.
type fd uintptr
|
||||
|
||||
// copy installs a page into guest memory at the given address using UFFDIO_COPY.
// mode controls write-protection: use UFFDIO_COPY_MODE_WP to preserve WP bit.
//
// data must be at least pagesize bytes long — the kernel copies pagesize
// bytes starting at &data[0].
func (f fd) copy(addr, pagesize uintptr, data []byte, mode C.ulonglong) error {
	// UFFDIO_COPY requires a page-aligned destination; faulting addresses
	// arrive unaligned, so mask off the low bits (pagesize is assumed to be
	// a power of two — TODO confirm against the regions Firecracker sends).
	alignedAddr := addr &^ (pagesize - 1)
	cpy := uffdioCopy{
		src: C.ulonglong(uintptr(unsafe.Pointer(&data[0]))),
		dst: C.ulonglong(alignedAddr),
		len: C.ulonglong(pagesize),
		mode: mode,
		copy: 0, // out-param: the kernel reports bytes actually copied here
	}

	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy)))
	if errno != 0 {
		return errno
	}

	// Even a successful ioctl can short-copy; treat that as an error.
	if cpy.copy != C.longlong(pagesize) {
		return fmt.Errorf("UFFDIO_COPY copied %d bytes, expected %d", cpy.copy, pagesize)
	}

	return nil
}
|
||||
|
||||
// close closes the userfaultfd file descriptor.
func (f fd) close() error {
	return syscall.Close(int(f))
}

// getMsgEvent extracts the event type from a uffd_msg.
func getMsgEvent(msg *uffdMsg) C.uchar {
	return msg.event
}

// getMsgArg extracts the arg union from a uffd_msg as raw bytes;
// callers reinterpret the bytes according to the event type.
func getMsgArg(msg *uffdMsg) [24]byte {
	return msg.arg
}

// getPagefaultAddress extracts the faulting address from a uffd_pagefault.
func getPagefaultAddress(pf *uffdPagefault) uintptr {
	return uintptr(pf.address)
}
|
||||
@ -1,41 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
//
|
||||
// Modifications by Omukk (Wrenn Sandbox): merged Region and Mapping into
|
||||
// single file, inlined shiftedOffset helper.
|
||||
|
||||
package uffd
|
||||
|
||||
import "fmt"
|
||||
|
||||
// Region is a mapping of guest memory to host virtual address space.
// Firecracker sends these as JSON when connecting to the UFFD socket.
// The JSON field names match Firecracker's UFFD protocol.
type Region struct {
	BaseHostVirtAddr uintptr `json:"base_host_virt_addr"` // start of the region in host VA space
	Size             uintptr `json:"size"`                // region length in bytes
	Offset           uintptr `json:"offset"`              // logical offset into the memory file
	PageSize         uintptr `json:"page_size_kib"`       // Actually in bytes despite the name.
}
|
||||
|
||||
// Mapping translates between host virtual addresses and logical memory offsets.
type Mapping struct {
	Regions []Region
}

// NewMapping creates a Mapping from a list of regions.
func NewMapping(regions []Region) *Mapping {
	return &Mapping{Regions: regions}
}
|
||||
|
||||
// GetOffset converts a host virtual address to a logical memory file offset
|
||||
// and returns the page size. This is called on every UFFD page fault.
|
||||
func (m *Mapping) GetOffset(hostVirtAddr uintptr) (int64, uintptr, error) {
|
||||
for _, r := range m.Regions {
|
||||
if hostVirtAddr >= r.BaseHostVirtAddr && hostVirtAddr < r.BaseHostVirtAddr+r.Size {
|
||||
offset := int64(hostVirtAddr-r.BaseHostVirtAddr) + int64(r.Offset)
|
||||
return offset, r.PageSize, nil
|
||||
}
|
||||
}
|
||||
return 0, 0, fmt.Errorf("address %#x not found in any memory region", hostVirtAddr)
|
||||
}
|
||||
@ -1,451 +0,0 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
// Modifications by M/S Omukk
|
||||
//
|
||||
// Modifications by Omukk (Wrenn Sandbox): replaced errgroup with WaitGroup
|
||||
// + semaphore, replaced fdexit abstraction with pipe, integrated with
|
||||
// snapshot.Header-based DiffFileSource instead of block.ReadonlyDevice,
|
||||
// fixed EAGAIN handling in poll loop.
|
||||
|
||||
package uffd
|
||||
|
||||
/*
|
||||
#include <linux/userfaultfd.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net"
|
||||
"os"
|
||||
"sync"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"git.omukk.dev/wrenn/wrenn/internal/snapshot"
|
||||
)
|
||||
|
||||
const (
	// fdSize is the byte width of the single file descriptor carried in the
	// SCM_RIGHTS control message (sizeof(int) on Linux).
	fdSize = 4
	// regionMappingsSize bounds the JSON region payload read from the socket.
	regionMappingsSize = 1024
	// maxConcurrentFaults caps in-flight page-fault handler goroutines.
	maxConcurrentFaults = 4096
)
|
||||
|
||||
// MemorySource provides page data for the UFFD handler.
// Given a logical memory offset and a size, it returns the page data.
//
// ReadPage may be invoked concurrently from many fault-handler goroutines
// (up to maxConcurrentFaults), so implementations must be safe for
// concurrent use.
type MemorySource interface {
	ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error)
}
|
||||
|
||||
// Server manages the UFFD Unix socket lifecycle and page fault handling
// for a single Firecracker snapshot restore.
type Server struct {
	socketPath string            // Unix socket path Firecracker connects to
	source     MemorySource      // backing store for faulted pages
	lis        *net.UnixListener // set by Start

	// readyCh is closed exactly once (guarded by readyOnce) when the
	// handler has connected — or when the server exits early, so waiters
	// are never stranded. doneErr may be read only after doneCh is closed.
	readyCh   chan struct{}
	readyOnce sync.Once
	doneCh    chan struct{}
	doneErr   error

	// exitPipe signals the poll loop to stop.
	exitR *os.File
	exitW *os.File

	// Set by handle() after Firecracker connects; read by Prefetch()
	// after waiting on readyCh (which establishes happens-before).
	uffdFd  fd
	mapping *Mapping

	// Prefetch lifecycle: cancel stops the goroutine, prefetchDone is
	// closed when it exits. Stop() drains prefetchDone before returning
	// so the caller can safely close diff file handles.
	prefetchCancel context.CancelFunc
	prefetchDone   chan struct{}
}
|
||||
|
||||
// NewServer creates a UFFD server that will listen on the given socket path
|
||||
// and serve memory pages from the given source.
|
||||
func NewServer(socketPath string, source MemorySource) *Server {
|
||||
return &Server{
|
||||
socketPath: socketPath,
|
||||
source: source,
|
||||
readyCh: make(chan struct{}),
|
||||
doneCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start begins listening on the Unix socket. Firecracker will connect to this
// socket after loadSnapshot is called with the UFFD backend.
// Start returns immediately; the server runs in a background goroutine.
func (s *Server) Start(ctx context.Context) error {
	lis, err := net.ListenUnix("unix", &net.UnixAddr{Name: s.socketPath, Net: "unix"})
	if err != nil {
		return fmt.Errorf("listen on uffd socket: %w", err)
	}
	s.lis = lis

	// World-writable so the Firecracker process can connect — presumably it
	// runs under a different (unprivileged) uid; confirm with process setup.
	if err := os.Chmod(s.socketPath, 0o777); err != nil {
		lis.Close()
		return fmt.Errorf("chmod uffd socket: %w", err)
	}

	// Create exit signal pipe.
	r, w, err := os.Pipe()
	if err != nil {
		lis.Close()
		return fmt.Errorf("create exit pipe: %w", err)
	}
	s.exitR = r
	s.exitW = w

	go func() {
		defer close(s.doneCh)
		s.doneErr = s.handle(ctx)
		s.lis.Close()
		s.exitR.Close()
		s.exitW.Close()
		// Unblock Ready() waiters even if handle failed before the
		// connection was established (readyOnce makes this safe if
		// handle already closed the channel).
		s.readyOnce.Do(func() { close(s.readyCh) })
	}()

	return nil
}
|
||||
|
||||
// Ready returns a channel that is closed when the UFFD handler is ready
// (after Firecracker has connected and sent the uffd fd). It is also closed
// if the server exits before a connection arrives, so waiters never block
// forever; check Wait/Stop for the terminal error.
func (s *Server) Ready() <-chan struct{} {
	return s.readyCh
}
|
||||
|
||||
// Stop signals the UFFD poll loop to exit and waits for it to finish.
// Also cancels and waits for any running prefetch goroutine.
func (s *Server) Stop() error {
	if s.prefetchCancel != nil {
		s.prefetchCancel()
	}
	// Write a byte to the exit pipe to wake the poll loop. If the server
	// already exited, the pipe end may be closed — the write error is
	// deliberately ignored.
	_, _ = s.exitW.Write([]byte{0})
	<-s.doneCh
	// Join the prefetch goroutine so callers can safely close diff file
	// handles after Stop returns.
	if s.prefetchDone != nil {
		<-s.prefetchDone
	}
	return s.doneErr
}
|
||||
|
||||
// Wait blocks until the server exits and returns its terminal error.
func (s *Server) Wait() error {
	<-s.doneCh
	return s.doneErr
}
|
||||
|
||||
// handle accepts the Firecracker connection, receives the UFFD fd via
// SCM_RIGHTS, and runs the page fault poll loop. It returns when the poll
// loop exits; the received fd is closed on return.
func (s *Server) handle(ctx context.Context) error {
	conn, err := s.lis.Accept()
	if err != nil {
		return fmt.Errorf("accept uffd connection: %w", err)
	}

	unixConn := conn.(*net.UnixConn)
	defer unixConn.Close()

	// Read the memory region mappings (JSON) and the UFFD fd (SCM_RIGHTS)
	// — both arrive in a single message.
	regionBuf := make([]byte, regionMappingsSize)
	uffdBuf := make([]byte, syscall.CmsgSpace(fdSize))

	nRegion, nFd, _, _, err := unixConn.ReadMsgUnix(regionBuf, uffdBuf)
	if err != nil {
		return fmt.Errorf("read uffd message: %w", err)
	}

	var regions []Region
	if err := json.Unmarshal(regionBuf[:nRegion], &regions); err != nil {
		return fmt.Errorf("parse memory regions: %w", err)
	}

	// Exactly one control message carrying exactly one fd is expected.
	controlMsgs, err := syscall.ParseSocketControlMessage(uffdBuf[:nFd])
	if err != nil {
		return fmt.Errorf("parse control messages: %w", err)
	}
	if len(controlMsgs) != 1 {
		return fmt.Errorf("expected 1 control message, got %d", len(controlMsgs))
	}

	fds, err := syscall.ParseUnixRights(&controlMsgs[0])
	if err != nil {
		return fmt.Errorf("parse unix rights: %w", err)
	}
	if len(fds) != 1 {
		return fmt.Errorf("expected 1 fd, got %d", len(fds))
	}

	uffdFd := fd(fds[0])
	defer uffdFd.close()

	mapping := NewMapping(regions)

	// Store for use by Prefetch(); readers synchronize on readyCh, which
	// is closed below, so these plain writes happen-before their reads.
	s.uffdFd = uffdFd
	s.mapping = mapping

	slog.Info("uffd handler connected",
		"regions", len(regions),
		"fd", int(uffdFd),
	)

	// Signal readiness.
	s.readyOnce.Do(func() { close(s.readyCh) })

	// Run the poll loop.
	return s.serve(ctx, uffdFd, mapping)
}
|
||||
|
||||
// serve is the main poll loop. It polls the UFFD fd for page fault events
// and the exit pipe for shutdown signals. Page fills are dispatched to a
// bounded pool of goroutines; serve only returns after all of them finish.
func (s *Server) serve(ctx context.Context, uffdFd fd, mapping *Mapping) error {
	pollFds := []unix.PollFd{
		{Fd: int32(uffdFd), Events: unix.POLLIN},
		{Fd: int32(s.exitR.Fd()), Events: unix.POLLIN},
	}

	var wg sync.WaitGroup
	sem := make(chan struct{}, maxConcurrentFaults)

	// Always wait for in-flight goroutines before returning, so the caller
	// can safely close the uffd fd after serve returns.
	defer wg.Wait()

	for {
		if _, err := unix.Poll(pollFds, -1); err != nil {
			if err == unix.EINTR || err == unix.EAGAIN {
				continue
			}
			return fmt.Errorf("poll: %w", err)
		}

		// Check exit signal.
		if pollFds[1].Revents&unix.POLLIN != 0 {
			return nil
		}

		if pollFds[0].Revents&unix.POLLIN == 0 {
			continue
		}

		// Read the uffd_msg. The fd is O_NONBLOCK (set by Firecracker),
		// so EAGAIN is expected — just go back to poll.
		buf := make([]byte, unsafe.Sizeof(uffdMsg{}))
		n, err := readUffdMsg(uffdFd, buf)
		if err == syscall.EAGAIN {
			continue
		}
		if err != nil {
			return fmt.Errorf("read uffd msg: %w", err)
		}
		if n == 0 {
			continue
		}

		// Reinterpret the raw bytes as the kernel's uffd_msg struct.
		msg := *(*uffdMsg)(unsafe.Pointer(&buf[0]))
		event := getMsgEvent(&msg)

		switch event {
		case UFFD_EVENT_PAGEFAULT:
			// Handled below.
		case UFFD_EVENT_REMOVE, UFFD_EVENT_UNMAP, UFFD_EVENT_REMAP, UFFD_EVENT_FORK:
			// Non-fatal lifecycle events from the guest kernel (e.g. balloon
			// deflation, mmap/munmap). No action needed — continue polling.
			continue
		default:
			return fmt.Errorf("unexpected uffd event type: %d", event)
		}

		arg := getMsgArg(&msg)
		pf := *(*uffdPagefault)(unsafe.Pointer(&arg[0]))
		addr := getPagefaultAddress(&pf)

		offset, pagesize, err := mapping.GetOffset(addr)
		if err != nil {
			return fmt.Errorf("resolve address %#x: %w", addr, err)
		}

		// Bounded fan-out: acquire a semaphore slot before spawning so at
		// most maxConcurrentFaults handlers run concurrently.
		sem <- struct{}{}
		wg.Add(1)
		go func() {
			defer wg.Done()
			defer func() { <-sem }()

			// Errors are logged, not fatal: one failed page fill should
			// not tear down the whole poll loop.
			if err := s.faultPage(ctx, uffdFd, addr, offset, pagesize); err != nil {
				slog.Error("uffd fault page error",
					"addr", fmt.Sprintf("%#x", addr),
					"offset", offset,
					"error", err,
				)
			}
		}()
	}
}
|
||||
|
||||
// readUffdMsg reads a single uffd_msg, retrying on EINTR.
|
||||
// Returns (n, EAGAIN) if the non-blocking read has nothing available.
|
||||
func readUffdMsg(uffdFd fd, buf []byte) (int, error) {
|
||||
for {
|
||||
n, err := syscall.Read(int(uffdFd), buf)
|
||||
if err == syscall.EINTR {
|
||||
continue
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
}
|
||||
|
||||
// faultPage fetches a page from the memory source and copies it into
|
||||
// guest memory via UFFDIO_COPY.
|
||||
func (s *Server) faultPage(ctx context.Context, uffdFd fd, addr uintptr, offset int64, pagesize uintptr) error {
|
||||
data, err := s.source.ReadPage(ctx, offset, int64(pagesize))
|
||||
if err != nil {
|
||||
return fmt.Errorf("read page at offset %d: %w", offset, err)
|
||||
}
|
||||
|
||||
// Mode 0: no write-protect. Standard Firecracker does not register
|
||||
// UFFD ranges with WP support, so UFFDIO_COPY_MODE_WP would fail.
|
||||
if err := uffdFd.copy(addr, pagesize, data, 0); err != nil {
|
||||
if errors.Is(err, unix.EEXIST) {
|
||||
// Page already mapped (race with prefetch or concurrent fault).
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("uffdio_copy: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prefetch proactively loads all guest memory pages in the background.
// It iterates over every page in every UFFD region and copies it from the
// diff file into guest memory via UFFDIO_COPY. Pages already loaded by
// on-demand faults return nil from faultPage (EEXIST handled internally).
// This eliminates the per-request latency caused by lazy page faulting
// after snapshot restore.
//
// The goroutine blocks on readyCh before reading the uffd fd and mapping
// fields (establishes happens-before with handle()). It uses an internal
// context independent of the caller's RPC context so it survives after the
// create/resume RPC returns. Stop() cancels and joins the goroutine.
func (s *Server) Prefetch() {
	ctx, cancel := context.WithCancel(context.Background())
	s.prefetchCancel = cancel
	s.prefetchDone = make(chan struct{})

	go func() {
		// prefetchDone lets Stop() join this goroutine.
		defer close(s.prefetchDone)

		// Wait for Firecracker to connect and send the uffd fd.
		select {
		case <-s.readyCh:
		case <-ctx.Done():
			return
		}

		uffdFd := s.uffdFd
		mapping := s.mapping
		if mapping == nil {
			// No region mapping was delivered; nothing to prefetch.
			return
		}

		var total, errored int
		for _, region := range mapping.Regions {
			pageSize := region.PageSize
			if pageSize == 0 {
				// A zero page size would make the loop below spin
				// forever; skip such regions defensively.
				continue
			}
			for off := uintptr(0); off < region.Size; off += pageSize {
				// Check for cancellation once per page so Stop()
				// is honored promptly even on large regions.
				if ctx.Err() != nil {
					slog.Debug("uffd prefetch cancelled",
						"pages", total, "errors", errored)
					return
				}

				// addr is the host virtual address of this page;
				// memOffset is its offset in the memory file.
				addr := region.BaseHostVirtAddr + off
				memOffset := int64(off) + int64(region.Offset)

				// Copy failures are non-fatal for prefetch: the page
				// can still be served by an on-demand fault later.
				if err := s.faultPage(ctx, uffdFd, addr, memOffset, pageSize); err != nil {
					errored++
				} else {
					total++
				}
			}
		}
		slog.Info("uffd prefetch complete",
			"pages", total, "errors", errored)
	}()
}
|
||||
|
||||
// DiffFileSource serves pages from a snapshot's compact diff file using
// the header's block mapping to resolve offsets.
type DiffFileSource struct {
	// header resolves guest-memory offsets to (diff-file offset, build ID)
	// pairs via GetShiftedMapping.
	header *snapshot.Header
	// diffs maps build ID → open file handle for each generation's diff file.
	// Handles are opened in NewDiffFileSource and released by Close.
	diffs map[string]*os.File
}
|
||||
|
||||
// NewDiffFileSource creates a memory source backed by snapshot diff files.
|
||||
// diffs maps build ID string to the file path of each generation's diff file.
|
||||
func NewDiffFileSource(header *snapshot.Header, diffPaths map[string]string) (*DiffFileSource, error) {
|
||||
diffs := make(map[string]*os.File, len(diffPaths))
|
||||
for id, path := range diffPaths {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
// Close already opened files.
|
||||
for _, opened := range diffs {
|
||||
opened.Close()
|
||||
}
|
||||
return nil, fmt.Errorf("open diff file %s: %w", path, err)
|
||||
}
|
||||
diffs[id] = f
|
||||
}
|
||||
return &DiffFileSource{header: header, diffs: diffs}, nil
|
||||
}
|
||||
|
||||
// ReadPage resolves a memory offset through the header mapping and reads
|
||||
// the corresponding page from the correct generation's diff file.
|
||||
func (s *DiffFileSource) ReadPage(ctx context.Context, offset int64, size int64) ([]byte, error) {
|
||||
mappedOffset, _, buildID, err := s.header.GetShiftedMapping(ctx, offset)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve offset %d: %w", offset, err)
|
||||
}
|
||||
|
||||
// uuid.Nil means zero-fill (empty page).
|
||||
var nilUUID [16]byte
|
||||
if *buildID == nilUUID {
|
||||
return make([]byte, size), nil
|
||||
}
|
||||
|
||||
f, ok := s.diffs[buildID.String()]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("no diff file for build %s", buildID)
|
||||
}
|
||||
|
||||
buf := make([]byte, size)
|
||||
n, err := f.ReadAt(buf, mappedOffset)
|
||||
if err != nil && int64(n) < size {
|
||||
return nil, fmt.Errorf("read diff at offset %d: %w", mappedOffset, err)
|
||||
}
|
||||
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// Close closes all open diff file handles.
|
||||
func (s *DiffFileSource) Close() error {
|
||||
var errs []error
|
||||
for _, f := range s.diffs {
|
||||
if err := f.Close(); err != nil {
|
||||
errs = append(errs, err)
|
||||
}
|
||||
}
|
||||
return errors.Join(errs...)
|
||||
}
|
||||
213
internal/vm/ch.go
Normal file
213
internal/vm/ch.go
Normal file
@ -0,0 +1,213 @@
|
||||
package vm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// chClient talks to the Cloud Hypervisor HTTP API over a Unix socket.
type chClient struct {
	// http is configured (by newCHClient) to dial socketPath for every
	// request, ignoring the URL's host portion.
	http *http.Client
	// socketPath is the filesystem path of CH's API Unix socket.
	socketPath string
}
|
||||
|
||||
func newCHClient(socketPath string) *chClient {
|
||||
return &chClient{
|
||||
socketPath: socketPath,
|
||||
http: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||
var d net.Dialer
|
||||
return d.DialContext(ctx, "unix", socketPath)
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *chClient) do(ctx context.Context, method, path string, body any) error {
|
||||
var bodyReader io.Reader
|
||||
if body != nil {
|
||||
data, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal request body: %w", err)
|
||||
}
|
||||
bodyReader = bytes.NewReader(data)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- CH API payload types ---
//
// These structs mirror the JSON request bodies of the Cloud Hypervisor
// HTTP API; only the fields this package actually sets are declared.

// chPayload describes what the guest boots: a direct kernel (plus
// command line) or a firmware image.
type chPayload struct {
	Firmware string `json:"firmware,omitempty"`
	Kernel   string `json:"kernel"`
	Cmdline  string `json:"cmdline"`
}

// chCPUs configures the guest vCPU topology.
type chCPUs struct {
	BootVCPUs int `json:"boot_vcpus"`
	MaxVCPUs  int `json:"max_vcpus"`
}

// chMemory configures guest RAM. Size is in bytes (createVM converts
// from MiB).
type chMemory struct {
	Size          uint64 `json:"size"`
	Shared        bool   `json:"shared,omitempty"`
	HotplugSize   uint64 `json:"hotplug_size,omitempty"`
	HotplugMethod string `json:"hotplug_method,omitempty"`
}

// chDisk attaches a block device backed by a host file.
type chDisk struct {
	Path      string `json:"path"`
	Readonly  bool   `json:"readonly,omitempty"`
	ImageType string `json:"image_type,omitempty"`
}

// chNet attaches a network device backed by a host TAP interface.
type chNet struct {
	Tap    string `json:"tap"`
	MAC    string `json:"mac"`
	NumQs  int    `json:"num_queues,omitempty"`
	QueueS int    `json:"queue_size,omitempty"`
}

// chBalloon configures the memory balloon device; DeflateOnOOM lets the
// guest reclaim balloon pages under memory pressure.
type chBalloon struct {
	Size         int64 `json:"size"`
	DeflateOnOOM bool  `json:"deflate_on_oom"`
	FreePageRep  bool  `json:"free_page_reporting,omitempty"`
}

// chConsole selects the serial/console backend mode (e.g. "Tty", "Off").
type chConsole struct {
	Mode string `json:"mode"`
}

// chCreatePayload is the full request body for /api/v1/vm.create.
type chCreatePayload struct {
	Payload chPayload  `json:"payload"`
	CPUs    chCPUs     `json:"cpus"`
	Memory  chMemory   `json:"memory"`
	Disks   []chDisk   `json:"disks"`
	Net     []chNet    `json:"net"`
	Balloon *chBalloon `json:"balloon,omitempty"`
	Serial  chConsole  `json:"serial"`
	Console chConsole  `json:"console"`
}
|
||||
|
||||
// createVM sends the full VM configuration as a single payload.
|
||||
func (c *chClient) createVM(ctx context.Context, cfg *VMConfig) error {
|
||||
memBytes := uint64(cfg.MemoryMB) * 1024 * 1024
|
||||
|
||||
payload := chCreatePayload{
|
||||
Payload: chPayload{
|
||||
Kernel: cfg.KernelPath,
|
||||
Cmdline: cfg.kernelArgs(),
|
||||
},
|
||||
CPUs: chCPUs{
|
||||
BootVCPUs: cfg.VCPUs,
|
||||
MaxVCPUs: cfg.VCPUs,
|
||||
},
|
||||
Memory: chMemory{
|
||||
Size: memBytes,
|
||||
Shared: true,
|
||||
},
|
||||
Disks: []chDisk{
|
||||
{
|
||||
Path: cfg.SandboxDir + "/rootfs.ext4",
|
||||
ImageType: "Raw",
|
||||
},
|
||||
},
|
||||
Net: []chNet{
|
||||
{
|
||||
Tap: cfg.TapDevice,
|
||||
MAC: cfg.TapMAC,
|
||||
},
|
||||
},
|
||||
Balloon: &chBalloon{
|
||||
Size: 0,
|
||||
DeflateOnOOM: true,
|
||||
FreePageRep: true,
|
||||
},
|
||||
Serial: chConsole{
|
||||
Mode: "Tty",
|
||||
},
|
||||
Console: chConsole{
|
||||
Mode: "Off",
|
||||
},
|
||||
}
|
||||
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.create", payload)
|
||||
}
|
||||
|
||||
// bootVM starts a VM previously defined via createVM.
func (c *chClient) bootVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.boot", nil)
}
|
||||
|
||||
// pauseVM pauses the microVM. The VM must be paused before a snapshot
// can be taken.
func (c *chClient) pauseVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.pause", nil)
}
|
||||
|
||||
// resumeVM resumes a previously paused microVM.
func (c *chClient) resumeVM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.resume", nil)
}
|
||||
|
||||
// snapshotVM creates a VM snapshot to the given directory.
|
||||
func (c *chClient) snapshotVM(ctx context.Context, destURL string) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.snapshot", map[string]string{
|
||||
"destination_url": destURL,
|
||||
})
|
||||
}
|
||||
|
||||
// restoreVM restores a VM from a snapshot via the API. Uses OnDemand memory
// restore mode for UFFD-based lazy page loading — only pages the guest
// actually touches are faulted in from disk. resume=true starts the guest
// immediately, so no separate vm.resume call is needed.
//
// NOTE(review): upstream Cloud Hypervisor's vm.restore RestoreConfig
// documents "source_url" and "prefault" but not "memory_restore_mode"
// or "resume" — confirm the deployed CH build accepts these fields.
func (c *chClient) restoreVM(ctx context.Context, sourceURL string) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vm.restore", map[string]any{
		"source_url":          sourceURL,
		"memory_restore_mode": "OnDemand",
		"resume":              true,
	})
}
|
||||
|
||||
// shutdownVMM cleanly shuts down the Cloud Hypervisor VMM process itself
// (guest and hypervisor), as opposed to stopping only the guest VM.
func (c *chClient) shutdownVMM(ctx context.Context) error {
	return c.do(ctx, http.MethodPut, "/api/v1/vmm.shutdown", nil)
}
|
||||
|
||||
// resizeBalloon adjusts the balloon target at runtime.
|
||||
// sizeBytes is memory to take FROM the guest (0 = give all back).
|
||||
func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.resize", map[string]int64{
|
||||
"desired_balloon": sizeBytes,
|
||||
})
|
||||
}
|
||||
|
||||
// ping checks if the VMM is alive and ready to accept commands. A nil
// error means the API socket is up and responding.
func (c *chClient) ping(ctx context.Context) error {
	return c.do(ctx, http.MethodGet, "/api/v1/vmm.ping", nil)
}
|
||||
@ -2,13 +2,12 @@ package vm
|
||||
|
||||
import "fmt"
|
||||
|
||||
// VMConfig holds the configuration for creating a Firecracker microVM.
|
||||
// VMConfig holds the configuration for creating a Cloud Hypervisor microVM.
|
||||
type VMConfig struct {
|
||||
// SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4").
|
||||
SandboxID string
|
||||
|
||||
// TemplateID is the template UUID string used to populate MMDS metadata
|
||||
// so that envd can read WRENN_TEMPLATE_ID from inside the guest.
|
||||
// TemplateID is the template UUID string, passed to envd via PostInit.
|
||||
TemplateID string
|
||||
|
||||
// KernelPath is the path to the uncompressed Linux kernel (vmlinux).
|
||||
@ -25,12 +24,12 @@ type VMConfig struct {
|
||||
MemoryMB int
|
||||
|
||||
// NetworkNamespace is the name of the network namespace to launch
|
||||
// Firecracker inside (e.g., "ns-1"). The namespace must already exist
|
||||
// Cloud Hypervisor inside (e.g., "ns-1"). The namespace must already exist
|
||||
// with a TAP device configured.
|
||||
NetworkNamespace string
|
||||
|
||||
// TapDevice is the name of the TAP device inside the network namespace
|
||||
// that Firecracker will attach to (e.g., "tap0").
|
||||
// that Cloud Hypervisor will attach to (e.g., "tap0").
|
||||
TapDevice string
|
||||
|
||||
// TapMAC is the MAC address for the TAP device.
|
||||
@ -45,19 +44,23 @@ type VMConfig struct {
|
||||
// NetMask is the subnet mask for the guest network (e.g., "255.255.255.252").
|
||||
NetMask string
|
||||
|
||||
// FirecrackerBin is the path to the firecracker binary.
|
||||
FirecrackerBin string
|
||||
// VMMBin is the path to the cloud-hypervisor binary.
|
||||
VMMBin string
|
||||
|
||||
// SocketPath is the path for the Firecracker API Unix socket.
|
||||
// SocketPath is the path for the Cloud Hypervisor API Unix socket.
|
||||
SocketPath string
|
||||
|
||||
// SandboxDir is the tmpfs mount point for per-sandbox files inside the
|
||||
// mount namespace (e.g., "/fc-vm").
|
||||
// mount namespace (e.g., "/ch-vm").
|
||||
SandboxDir string
|
||||
|
||||
// InitPath is the path to the init process inside the guest.
|
||||
// Defaults to "/sbin/init" if empty.
|
||||
InitPath string
|
||||
|
||||
// SnapshotDir is the path to the snapshot directory for restore.
|
||||
// Only set when restoring from a snapshot.
|
||||
SnapshotDir string
|
||||
}
|
||||
|
||||
func (c *VMConfig) applyDefaults() {
|
||||
@ -67,14 +70,14 @@ func (c *VMConfig) applyDefaults() {
|
||||
if c.MemoryMB == 0 {
|
||||
c.MemoryMB = 512
|
||||
}
|
||||
if c.FirecrackerBin == "" {
|
||||
c.FirecrackerBin = "/usr/local/bin/firecracker"
|
||||
if c.VMMBin == "" {
|
||||
c.VMMBin = "/usr/local/bin/cloud-hypervisor"
|
||||
}
|
||||
if c.SocketPath == "" {
|
||||
c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID)
|
||||
c.SocketPath = fmt.Sprintf("/tmp/ch-%s.sock", c.SandboxID)
|
||||
}
|
||||
if c.SandboxDir == "" {
|
||||
c.SandboxDir = "/tmp/fc-vm"
|
||||
c.SandboxDir = "/tmp/ch-vm"
|
||||
}
|
||||
if c.TapDevice == "" {
|
||||
c.TapDevice = "tap0"
|
||||
@ -95,7 +98,7 @@ func (c *VMConfig) kernelArgs() string {
|
||||
)
|
||||
|
||||
return fmt.Sprintf(
|
||||
"console=ttyS0 reboot=k panic=1 pci=off quiet loglevel=1 clocksource=kvm-clock init=%s %s",
|
||||
"console=ttyS0 root=/dev/vda rw reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=%s %s",
|
||||
c.InitPath, ipArg,
|
||||
)
|
||||
}
|
||||
|
||||
@ -1,202 +0,0 @@
|
||||
package vm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// fcClient talks to the Firecracker HTTP API over a Unix socket.
|
||||
type fcClient struct {
|
||||
http *http.Client
|
||||
socketPath string
|
||||
}
|
||||
|
||||
func newFCClient(socketPath string) *fcClient {
|
||||
return &fcClient{
|
||||
socketPath: socketPath,
|
||||
http: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||
var d net.Dialer
|
||||
return d.DialContext(ctx, "unix", socketPath)
|
||||
},
|
||||
},
|
||||
// No global timeout — callers pass context.Context with appropriate
|
||||
// deadlines. A fixed 10s timeout was too short for snapshot/resume
|
||||
// operations on large-memory VMs (20GB+ memfiles).
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *fcClient) do(ctx context.Context, method, path string, body any) error {
|
||||
var bodyReader io.Reader
|
||||
if body != nil {
|
||||
data, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal request body: %w", err)
|
||||
}
|
||||
bodyReader = bytes.NewReader(data)
|
||||
}
|
||||
|
||||
// The host in the URL is ignored for Unix sockets; we use "localhost" by convention.
|
||||
req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// setBootSource configures the kernel and boot args.
|
||||
func (c *fcClient) setBootSource(ctx context.Context, kernelPath, bootArgs string) error {
|
||||
return c.do(ctx, http.MethodPut, "/boot-source", map[string]string{
|
||||
"kernel_image_path": kernelPath,
|
||||
"boot_args": bootArgs,
|
||||
})
|
||||
}
|
||||
|
||||
// setRootfsDrive configures the root filesystem drive.
|
||||
func (c *fcClient) setRootfsDrive(ctx context.Context, driveID, path string, readOnly bool) error {
|
||||
return c.do(ctx, http.MethodPut, "/drives/"+driveID, map[string]any{
|
||||
"drive_id": driveID,
|
||||
"path_on_host": path,
|
||||
"is_root_device": true,
|
||||
"is_read_only": readOnly,
|
||||
})
|
||||
}
|
||||
|
||||
// setNetworkInterface configures a network interface attached to a TAP device.
|
||||
// A tx_rate_limiter caps sustained guest→host throughput to prevent user
|
||||
// application traffic from completely saturating the TAP device and starving
|
||||
// envd control traffic (PTY, exec, file ops).
|
||||
func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, macAddr string) error {
|
||||
return c.do(ctx, http.MethodPut, "/network-interfaces/"+ifaceID, map[string]any{
|
||||
"iface_id": ifaceID,
|
||||
"host_dev_name": tapName,
|
||||
"guest_mac": macAddr,
|
||||
"tx_rate_limiter": map[string]any{
|
||||
"bandwidth": map[string]any{
|
||||
"size": 209715200, // 200 MB/s sustained
|
||||
"refill_time": 1000, // refill period: 1 second
|
||||
"one_time_burst": 104857600, // 100 MB initial burst
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// setMachineConfig configures vCPUs, memory, and other machine settings.
|
||||
func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error {
|
||||
return c.do(ctx, http.MethodPut, "/machine-config", map[string]any{
|
||||
"vcpu_count": vcpus,
|
||||
"mem_size_mib": memMB,
|
||||
"smt": false,
|
||||
})
|
||||
}
|
||||
|
||||
// setMMDSConfig enables MMDS V2 token-based access on the given network interface.
|
||||
// Must be called before startVM.
|
||||
func (c *fcClient) setMMDSConfig(ctx context.Context, ifaceID string) error {
|
||||
return c.do(ctx, http.MethodPut, "/mmds/config", map[string]any{
|
||||
"version": "V2",
|
||||
"network_interfaces": []string{ifaceID},
|
||||
})
|
||||
}
|
||||
|
||||
// mmdsMetadata is the metadata payload written to the Firecracker MMDS store.
|
||||
// envd reads this via PollForMMDSOpts to populate WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID.
|
||||
type mmdsMetadata struct {
|
||||
SandboxID string `json:"instanceID"`
|
||||
TemplateID string `json:"envID"`
|
||||
}
|
||||
|
||||
// setMMDS writes sandbox metadata to the Firecracker MMDS store.
|
||||
// Can be called after the VM has started.
|
||||
func (c *fcClient) setMMDS(ctx context.Context, sandboxID, templateID string) error {
|
||||
return c.do(ctx, http.MethodPut, "/mmds", mmdsMetadata{
|
||||
SandboxID: sandboxID,
|
||||
TemplateID: templateID,
|
||||
})
|
||||
}
|
||||
|
||||
// setBalloon configures the Firecracker balloon device for dynamic memory
|
||||
// management. deflateOnOom lets the guest reclaim balloon pages under memory
|
||||
// pressure. statsInterval enables periodic stats via GET /balloon/statistics.
|
||||
// Must be called before startVM.
|
||||
func (c *fcClient) setBalloon(ctx context.Context, amountMiB int, deflateOnOom bool, statsIntervalS int) error {
|
||||
return c.do(ctx, http.MethodPut, "/balloon", map[string]any{
|
||||
"amount_mib": amountMiB,
|
||||
"deflate_on_oom": deflateOnOom,
|
||||
"stats_polling_interval_s": statsIntervalS,
|
||||
})
|
||||
}
|
||||
|
||||
// updateBalloon adjusts the balloon target at runtime.
|
||||
func (c *fcClient) updateBalloon(ctx context.Context, amountMiB int) error {
|
||||
return c.do(ctx, http.MethodPatch, "/balloon", map[string]any{
|
||||
"amount_mib": amountMiB,
|
||||
})
|
||||
}
|
||||
|
||||
// startVM issues the InstanceStart action.
|
||||
func (c *fcClient) startVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/actions", map[string]string{
|
||||
"action_type": "InstanceStart",
|
||||
})
|
||||
}
|
||||
|
||||
// pauseVM pauses the microVM.
|
||||
func (c *fcClient) pauseVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
|
||||
"state": "Paused",
|
||||
})
|
||||
}
|
||||
|
||||
// resumeVM resumes a paused microVM.
|
||||
func (c *fcClient) resumeVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
|
||||
"state": "Resumed",
|
||||
})
|
||||
}
|
||||
|
||||
// createSnapshot creates a VM snapshot.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
|
||||
return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{
|
||||
"snapshot_type": snapshotType,
|
||||
"snapshot_path": snapPath,
|
||||
"mem_file_path": memPath,
|
||||
})
|
||||
}
|
||||
|
||||
// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
|
||||
// lazy memory loading. Firecracker will connect to the socket and
|
||||
// send the uffd fd + memory region mappings.
|
||||
func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error {
|
||||
return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
|
||||
"snapshot_path": snapPath,
|
||||
"resume_vm": false,
|
||||
"mem_backend": map[string]any{
|
||||
"backend_type": "Uffd",
|
||||
"backend_path": uffdSocketPath,
|
||||
},
|
||||
})
|
||||
}
|
||||
@ -9,14 +9,14 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// VM represents a running Firecracker microVM.
|
||||
// VM represents a running Cloud Hypervisor microVM.
|
||||
type VM struct {
|
||||
Config VMConfig
|
||||
process *process
|
||||
client *fcClient
|
||||
client *chClient
|
||||
}
|
||||
|
||||
// Manager handles the lifecycle of Firecracker microVMs.
|
||||
// Manager handles the lifecycle of Cloud Hypervisor microVMs.
|
||||
type Manager struct {
|
||||
mu sync.RWMutex
|
||||
// vms tracks running VMs by sandbox ID.
|
||||
@ -30,7 +30,7 @@ func NewManager() *Manager {
|
||||
}
|
||||
}
|
||||
|
||||
// Create boots a new Firecracker microVM with the given configuration.
|
||||
// Create boots a new Cloud Hypervisor microVM with the given configuration.
|
||||
// The network namespace and TAP device must already be set up.
|
||||
func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
cfg.applyDefaults()
|
||||
@ -38,7 +38,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return nil, fmt.Errorf("invalid config: %w", err)
|
||||
}
|
||||
|
||||
// Clean up any leftover socket from a previous run.
|
||||
os.Remove(cfg.SocketPath)
|
||||
|
||||
slog.Info("creating VM",
|
||||
@ -47,7 +46,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
"memory_mb", cfg.MemoryMB,
|
||||
)
|
||||
|
||||
// Step 1: Launch the Firecracker process.
|
||||
// Step 1: Launch the Cloud Hypervisor process.
|
||||
proc, err := startProcess(ctx, &cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
@ -59,25 +58,18 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return nil, fmt.Errorf("wait for socket: %w", err)
|
||||
}
|
||||
|
||||
// Step 3: Configure the VM via the Firecracker API.
|
||||
client := newFCClient(cfg.SocketPath)
|
||||
// Step 3: Configure and boot the VM via a single API call.
|
||||
client := newCHClient(cfg.SocketPath)
|
||||
|
||||
if err := configureVM(ctx, client, &cfg); err != nil {
|
||||
if err := client.createVM(ctx, &cfg); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("configure VM: %w", err)
|
||||
return nil, fmt.Errorf("create VM config: %w", err)
|
||||
}
|
||||
|
||||
// Step 4: Start the VM.
|
||||
if err := client.startVM(ctx); err != nil {
|
||||
// Step 4: Boot the VM.
|
||||
if err := client.bootVM(ctx); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("start VM: %w", err)
|
||||
}
|
||||
|
||||
// Step 5: Push sandbox metadata into MMDS so envd can read
|
||||
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
|
||||
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("set MMDS metadata: %w", err)
|
||||
return nil, fmt.Errorf("boot VM: %w", err)
|
||||
}
|
||||
|
||||
vm := &VM{
|
||||
@ -95,46 +87,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return vm, nil
|
||||
}
|
||||
|
||||
// configureVM sends the configuration to Firecracker via its HTTP API.
|
||||
func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
|
||||
// Boot source (kernel + args)
|
||||
if err := client.setBootSource(ctx, cfg.KernelPath, cfg.kernelArgs()); err != nil {
|
||||
return fmt.Errorf("set boot source: %w", err)
|
||||
}
|
||||
|
||||
// Root drive — use the symlink path inside the mount namespace so that
|
||||
// snapshots record a stable path that works on restore.
|
||||
rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
|
||||
if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
|
||||
return fmt.Errorf("set rootfs drive: %w", err)
|
||||
}
|
||||
|
||||
// Network interface
|
||||
if err := client.setNetworkInterface(ctx, "eth0", cfg.TapDevice, cfg.TapMAC); err != nil {
|
||||
return fmt.Errorf("set network interface: %w", err)
|
||||
}
|
||||
|
||||
// Machine config (vCPUs + memory)
|
||||
if err := client.setMachineConfig(ctx, cfg.VCPUs, cfg.MemoryMB); err != nil {
|
||||
return fmt.Errorf("set machine config: %w", err)
|
||||
}
|
||||
|
||||
// Balloon device — allows the host to reclaim unused guest memory.
|
||||
// Start with 0 (no inflation). deflate_on_oom lets the guest reclaim
|
||||
// balloon pages under memory pressure. Stats interval enables monitoring.
|
||||
if err := client.setBalloon(ctx, 0, true, 5); err != nil {
|
||||
slog.Warn("set balloon failed (non-fatal, VM will run without memory reclaim)", "error", err)
|
||||
}
|
||||
|
||||
// MMDS config — enable V2 token access on eth0 so that envd can read
|
||||
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
|
||||
if err := client.setMMDSConfig(ctx, "eth0"); err != nil {
|
||||
return fmt.Errorf("set MMDS config: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Pause pauses a running VM.
|
||||
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
m.mu.RLock()
|
||||
@ -179,7 +131,8 @@ func (m *Manager) UpdateBalloon(ctx context.Context, sandboxID string, amountMiB
|
||||
return fmt.Errorf("VM not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
return vm.client.updateBalloon(ctx, amountMiB)
|
||||
sizeBytes := int64(amountMiB) * 1024 * 1024
|
||||
return vm.client.resizeBalloon(ctx, sizeBytes)
|
||||
}
|
||||
|
||||
// Destroy stops and cleans up a VM.
|
||||
@ -195,12 +148,17 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
|
||||
slog.Info("destroying VM", "sandbox", sandboxID)
|
||||
|
||||
// Stop the Firecracker process.
|
||||
// Try clean shutdown first, fall back to process kill.
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := vm.client.shutdownVMM(shutdownCtx); err != nil {
|
||||
slog.Debug("clean VMM shutdown failed, killing process", "sandbox", sandboxID, "error", err)
|
||||
}
|
||||
shutdownCancel()
|
||||
|
||||
if err := vm.process.stop(); err != nil {
|
||||
slog.Warn("error stopping process", "sandbox", sandboxID, "error", err)
|
||||
}
|
||||
|
||||
// Clean up the API socket.
|
||||
os.Remove(vm.Config.SocketPath)
|
||||
|
||||
slog.Info("VM destroyed", "sandbox", sandboxID)
|
||||
@ -208,8 +166,8 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
}
|
||||
|
||||
// Snapshot creates a VM snapshot. The VM must already be paused.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
|
||||
// destURL is the file:// URL to the snapshot directory.
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapshotDir string) error {
|
||||
m.mu.RLock()
|
||||
vm, ok := m.vms[sandboxID]
|
||||
m.mu.RUnlock()
|
||||
@ -217,29 +175,35 @@ func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, sn
|
||||
return fmt.Errorf("VM not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
|
||||
destURL := "file://" + snapshotDir
|
||||
if err := vm.client.snapshotVM(ctx, destURL); err != nil {
|
||||
return fmt.Errorf("create snapshot: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snapshot_dir", snapshotDir)
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot
|
||||
// using UFFD for lazy memory loading. The network namespace and TAP
|
||||
// device must already be set up.
|
||||
// CreateFromSnapshot boots a new Cloud Hypervisor VM by restoring from a
|
||||
// snapshot directory. The network namespace and TAP device must already be set up.
|
||||
//
|
||||
// No boot resources (kernel, drives, machine config) are configured —
|
||||
// the snapshot carries all that state. The rootfs path recorded in the
|
||||
// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4
|
||||
// inside the mount namespace (created by the start script in jailer.go).
|
||||
// A bare CH process is started first, then the restore is performed via the API
|
||||
// with memory_restore_mode=OnDemand for UFFD-based lazy page loading. This means
|
||||
// only pages the guest actually touches are faulted in from disk — a 16GB template
|
||||
// with 2GB active working set only loads ~2GB into RAM at restore time.
|
||||
//
|
||||
// The restore API also sets resume=true, so the VM starts running immediately
|
||||
// without a separate resume call.
|
||||
//
|
||||
// The rootfs path recorded in the snapshot is resolved via a stable symlink at
|
||||
// SandboxDir/rootfs.ext4 inside the mount namespace.
|
||||
//
|
||||
// The sequence is:
|
||||
// 1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink)
|
||||
// 1. Start bare CH process in mount+network namespace
|
||||
// 2. Wait for API socket
|
||||
// 3. Load snapshot with UFFD backend
|
||||
// 4. Resume VM execution
|
||||
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) {
|
||||
// 3. Restore VM via API (OnDemand memory + auto-resume)
|
||||
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshotDir string) (*VM, error) {
|
||||
cfg.SnapshotDir = snapshotDir
|
||||
cfg.applyDefaults()
|
||||
if err := cfg.validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid config: %w", err)
|
||||
@ -249,14 +213,11 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
|
||||
slog.Info("restoring VM from snapshot",
|
||||
"sandbox", cfg.SandboxID,
|
||||
"snap_path", snapPath,
|
||||
"snapshot_dir", snapshotDir,
|
||||
)
|
||||
|
||||
// Step 1: Launch the Firecracker process.
|
||||
// The start script creates a tmpfs at SandboxDir and symlinks
|
||||
// rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs
|
||||
// path (/fc-vm/rootfs.ext4) resolves to the new clone.
|
||||
proc, err := startProcess(ctx, &cfg)
|
||||
// Step 1: Launch bare CH process (no --restore).
|
||||
proc, err := startProcessForRestore(ctx, &cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
}
|
||||
@ -267,26 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
return nil, fmt.Errorf("wait for socket: %w", err)
|
||||
}
|
||||
|
||||
client := newFCClient(cfg.SocketPath)
|
||||
client := newCHClient(cfg.SocketPath)
|
||||
|
||||
// Step 3: Load the snapshot with UFFD backend.
|
||||
// No boot resources are configured — the snapshot carries kernel,
|
||||
// drive, network, and machine config state.
|
||||
if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
|
||||
// Step 3: Restore via API with OnDemand memory + auto-resume.
|
||||
sourceURL := "file://" + snapshotDir
|
||||
if err := client.restoreVM(ctx, sourceURL); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("load snapshot: %w", err)
|
||||
}
|
||||
|
||||
// Step 4: Resume the VM.
|
||||
if err := client.resumeVM(ctx); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("resume VM: %w", err)
|
||||
}
|
||||
|
||||
// Step 5: Push sandbox metadata into MMDS.
|
||||
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("set MMDS metadata: %w", err)
|
||||
return nil, fmt.Errorf("restore VM: %w", err)
|
||||
}
|
||||
|
||||
vm := &VM{
|
||||
@ -304,11 +252,15 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
}
|
||||
|
||||
// PID returns the process ID of the unshare wrapper process.
|
||||
// The actual Firecracker process is a direct child of this PID.
|
||||
func (v *VM) PID() int {
|
||||
return v.process.cmd.Process.Pid
|
||||
}
|
||||
|
||||
// Exited returns a channel that is closed when the VM process exits.
|
||||
func (v *VM) Exited() <-chan struct{} {
|
||||
return v.process.exited()
|
||||
}
|
||||
|
||||
// Get returns a running VM by sandbox ID.
|
||||
func (m *Manager) Get(sandboxID string) (*VM, bool) {
|
||||
m.mu.RLock()
|
||||
@ -317,7 +269,7 @@ func (m *Manager) Get(sandboxID string) (*VM, bool) {
|
||||
return vm, ok
|
||||
}
|
||||
|
||||
// waitForSocket polls for the Firecracker API socket to appear on disk.
|
||||
// waitForSocket polls for the Cloud Hypervisor API socket to appear on disk.
|
||||
func waitForSocket(ctx context.Context, socketPath string, proc *process) error {
|
||||
ticker := time.NewTicker(10 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
@ -329,7 +281,7 @@ func waitForSocket(ctx context.Context, socketPath string, proc *process) error
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-proc.exited():
|
||||
return fmt.Errorf("firecracker process exited before socket was ready")
|
||||
return fmt.Errorf("cloud-hypervisor process exited before socket was ready")
|
||||
case <-timeout:
|
||||
return fmt.Errorf("timed out waiting for API socket at %s", socketPath)
|
||||
case <-ticker.C:
|
||||
|
||||
@ -10,7 +10,7 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// process represents a running Firecracker process with mount and network
|
||||
// process represents a running Cloud Hypervisor process with mount and network
|
||||
// namespace isolation.
|
||||
type process struct {
|
||||
cmd *exec.Cmd
|
||||
@ -20,33 +20,42 @@ type process struct {
|
||||
exitErr error
|
||||
}
|
||||
|
||||
// startProcess launches the Firecracker binary inside an isolated mount namespace
|
||||
// and the specified network namespace. The launch sequence:
|
||||
// startProcess launches the Cloud Hypervisor binary inside an isolated mount
|
||||
// namespace and the specified network namespace. Used for fresh boot (no
|
||||
// snapshot). The launch sequence:
|
||||
//
|
||||
// 1. unshare -m: creates a private mount namespace
|
||||
// 2. mount --make-rprivate /: prevents mount propagation to host
|
||||
// 3. mount tmpfs at SandboxDir: ephemeral workspace for this VM
|
||||
// 4. symlink kernel and rootfs into SandboxDir
|
||||
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
|
||||
// 6. exec firecracker with the API socket path
|
||||
// 6. exec cloud-hypervisor with the API socket path
|
||||
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
// Use a background context for the long-lived Firecracker process.
|
||||
// The request context (ctx) is only used for the startup phase — we must
|
||||
// not tie the VM's lifetime to the HTTP request that created it.
|
||||
execCtx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
script := buildStartScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
|
||||
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
|
||||
// The restore is performed via the API after the socket is ready, which allows
|
||||
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
script := buildRestoreScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
|
||||
func launchScript(script string, cfg *VMConfig) (*process, error) {
|
||||
execCtx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
cmd := exec.CommandContext(execCtx, "unshare", "-m", "--", "bash", "-c", script)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Setsid: true, // new session so signals don't propagate from parent
|
||||
Setsid: true,
|
||||
}
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
cancel()
|
||||
return nil, fmt.Errorf("start firecracker process: %w", err)
|
||||
return nil, fmt.Errorf("start cloud-hypervisor process: %w", err)
|
||||
}
|
||||
|
||||
p := &process{
|
||||
@ -60,7 +69,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
close(p.exitCh)
|
||||
}()
|
||||
|
||||
slog.Info("firecracker process started",
|
||||
slog.Info("cloud-hypervisor process started",
|
||||
"pid", cmd.Process.Pid,
|
||||
"sandbox", cfg.SandboxID,
|
||||
)
|
||||
@ -68,35 +77,56 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// buildStartScript generates the bash script that sets up the mount namespace,
|
||||
// symlinks kernel/rootfs, and execs Firecracker inside the network namespace.
|
||||
// buildStartScript generates the bash script for fresh boot: sets up mount
|
||||
// namespace, symlinks kernel/rootfs, and execs Cloud Hypervisor.
|
||||
func buildStartScript(cfg *VMConfig) string {
|
||||
return fmt.Sprintf(`
|
||||
set -euo pipefail
|
||||
|
||||
# Prevent mount propagation to the host
|
||||
mount --make-rprivate /
|
||||
|
||||
# Create ephemeral tmpfs workspace
|
||||
mkdir -p %[1]s
|
||||
mount -t tmpfs tmpfs %[1]s
|
||||
|
||||
# Symlink kernel and rootfs into the workspace
|
||||
ln -s %[2]s %[1]s/vmlinux
|
||||
ln -s %[3]s %[1]s/rootfs.ext4
|
||||
|
||||
# Launch Firecracker inside the network namespace
|
||||
exec ip netns exec %[4]s %[5]s --api-sock %[6]s
|
||||
exec ip netns exec %[4]s %[5]s --api-socket path=%[6]s
|
||||
`,
|
||||
cfg.SandboxDir, // 1
|
||||
cfg.KernelPath, // 2
|
||||
cfg.RootfsPath, // 3
|
||||
cfg.NetworkNamespace, // 4
|
||||
cfg.FirecrackerBin, // 5
|
||||
cfg.VMMBin, // 5
|
||||
cfg.SocketPath, // 6
|
||||
)
|
||||
}
|
||||
|
||||
// buildRestoreScript generates the bash script for snapshot restore: sets up
|
||||
// mount namespace, symlinks rootfs, and starts a bare Cloud Hypervisor process.
|
||||
// The actual restore is done via the API (PUT /vm.restore) after the socket is
|
||||
// ready, which enables memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||
func buildRestoreScript(cfg *VMConfig) string {
|
||||
return fmt.Sprintf(`
|
||||
set -euo pipefail
|
||||
|
||||
mount --make-rprivate /
|
||||
|
||||
mkdir -p %[1]s
|
||||
mount -t tmpfs tmpfs %[1]s
|
||||
|
||||
ln -s %[2]s %[1]s/rootfs.ext4
|
||||
|
||||
exec ip netns exec %[3]s %[4]s --api-socket path=%[5]s
|
||||
`,
|
||||
cfg.SandboxDir, // 1
|
||||
cfg.RootfsPath, // 2
|
||||
cfg.NetworkNamespace, // 3
|
||||
cfg.VMMBin, // 4
|
||||
cfg.SocketPath, // 5
|
||||
)
|
||||
}
|
||||
|
||||
// stop sends SIGTERM and waits for the process to exit. If it doesn't exit
|
||||
// within 10 seconds, SIGKILL is sent.
|
||||
func (p *process) stop() error {
|
||||
@ -104,7 +134,6 @@ func (p *process) stop() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Send SIGTERM to the process group (negative PID).
|
||||
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGTERM); err != nil {
|
||||
slog.Debug("sigterm failed, process may have exited", "error", err)
|
||||
}
|
||||
@ -113,7 +142,7 @@ func (p *process) stop() error {
|
||||
case <-p.exitCh:
|
||||
return nil
|
||||
case <-time.After(10 * time.Second):
|
||||
slog.Warn("firecracker did not exit after SIGTERM, sending SIGKILL")
|
||||
slog.Warn("cloud-hypervisor did not exit after SIGTERM, sending SIGKILL")
|
||||
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGKILL); err != nil {
|
||||
slog.Debug("sigkill failed", "error", err)
|
||||
}
|
||||
@ -177,10 +177,11 @@ func Run(opts ...Option) {
|
||||
Config: cfg,
|
||||
}
|
||||
|
||||
// Host monitor (passive + active reconciliation every 60s).
|
||||
// Created before API server so the heartbeat handler can trigger immediate
|
||||
// reconciliation when a host recovers from unreachable.
|
||||
monitor := api.NewHostMonitor(queries, hostPool, al, 60*time.Second)
|
||||
// Host monitor (safety-net reconciliation every 5 minutes).
|
||||
// Primary state sync is push-based (host agent callbacks + CP background
|
||||
// goroutines). The monitor acts as a fallback for missed events, host death
|
||||
// detection, and transient status resolution.
|
||||
monitor := api.NewHostMonitor(queries, hostPool, al, 5*time.Minute)
|
||||
|
||||
// API server.
|
||||
srv := api.New(queries, hostPool, hostScheduler, pool, rdb, []byte(cfg.JWTSecret), oauthRegistry, cfg.OAuthRedirectURL, ca, al, channelSvc, mailer, o.extensions, sctx, monitor, o.version)
|
||||
|
||||
@ -326,7 +326,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) {
|
||||
s.failBuild(buildCtx, buildID, fmt.Sprintf("create sandbox failed: %v", err))
|
||||
return
|
||||
}
|
||||
// Capture sandbox metadata (envd/kernel/firecracker/agent versions).
|
||||
// Capture sandbox metadata (envd/kernel/vmm/agent versions).
|
||||
sandboxMetadata := resp.Msg.Metadata
|
||||
|
||||
// Record sandbox/host association.
|
||||
@ -768,7 +768,7 @@ var runtimeEnvVars = map[string]bool{
|
||||
"HOME": true, "USER": true, "LOGNAME": true, "SHELL": true,
|
||||
"PWD": true, "OLDPWD": true, "HOSTNAME": true, "TERM": true,
|
||||
"SHLVL": true, "_": true,
|
||||
// Per-sandbox identifiers set by envd at boot via MMDS.
|
||||
// Per-sandbox identifiers set by envd at boot via PostInit.
|
||||
"WRENN_SANDBOX_ID": true, "WRENN_TEMPLATE_ID": true,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user