forked from wrenn/wrenn
feat(vm): replace Firecracker with Cloud Hypervisor
Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required. Key changes: - Remove fc.go, jailer.go (FC process management) - Remove internal/uffd/ package (userfaultfd lazy page loading) - Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format) - Add ch.go (CH HTTP API client over Unix socket) - Add process.go (CH process lifecycle with unshare+netns) - Add chversion.go (CH version detection) - Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher - Simplify snapshot/local.go to CH's native snapshot format - Update VM config: FirecrackerBin → VMMBin, new CH-specific fields - Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
213
internal/vm/ch.go
Normal file
213
internal/vm/ch.go
Normal file
@ -0,0 +1,213 @@
|
||||
package vm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// chClient talks to the Cloud Hypervisor HTTP API over a Unix socket.
|
||||
type chClient struct {
|
||||
http *http.Client
|
||||
socketPath string
|
||||
}
|
||||
|
||||
func newCHClient(socketPath string) *chClient {
|
||||
return &chClient{
|
||||
socketPath: socketPath,
|
||||
http: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||
var d net.Dialer
|
||||
return d.DialContext(ctx, "unix", socketPath)
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *chClient) do(ctx context.Context, method, path string, body any) error {
|
||||
var bodyReader io.Reader
|
||||
if body != nil {
|
||||
data, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal request body: %w", err)
|
||||
}
|
||||
bodyReader = bytes.NewReader(data)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- CH API payload types ---
|
||||
|
||||
type chPayload struct {
|
||||
Firmware string `json:"firmware,omitempty"`
|
||||
Kernel string `json:"kernel"`
|
||||
Cmdline string `json:"cmdline"`
|
||||
}
|
||||
|
||||
type chCPUs struct {
|
||||
BootVCPUs int `json:"boot_vcpus"`
|
||||
MaxVCPUs int `json:"max_vcpus"`
|
||||
}
|
||||
|
||||
type chMemory struct {
|
||||
Size uint64 `json:"size"`
|
||||
Shared bool `json:"shared,omitempty"`
|
||||
HotplugSize uint64 `json:"hotplug_size,omitempty"`
|
||||
HotplugMethod string `json:"hotplug_method,omitempty"`
|
||||
}
|
||||
|
||||
type chDisk struct {
|
||||
Path string `json:"path"`
|
||||
Readonly bool `json:"readonly,omitempty"`
|
||||
ImageType string `json:"image_type,omitempty"`
|
||||
}
|
||||
|
||||
type chNet struct {
|
||||
Tap string `json:"tap"`
|
||||
MAC string `json:"mac"`
|
||||
NumQs int `json:"num_queues,omitempty"`
|
||||
QueueS int `json:"queue_size,omitempty"`
|
||||
}
|
||||
|
||||
type chBalloon struct {
|
||||
Size int64 `json:"size"`
|
||||
DeflateOnOOM bool `json:"deflate_on_oom"`
|
||||
FreePageRep bool `json:"free_page_reporting,omitempty"`
|
||||
}
|
||||
|
||||
type chConsole struct {
|
||||
Mode string `json:"mode"`
|
||||
}
|
||||
|
||||
type chCreatePayload struct {
|
||||
Payload chPayload `json:"payload"`
|
||||
CPUs chCPUs `json:"cpus"`
|
||||
Memory chMemory `json:"memory"`
|
||||
Disks []chDisk `json:"disks"`
|
||||
Net []chNet `json:"net"`
|
||||
Balloon *chBalloon `json:"balloon,omitempty"`
|
||||
Serial chConsole `json:"serial"`
|
||||
Console chConsole `json:"console"`
|
||||
}
|
||||
|
||||
// createVM sends the full VM configuration as a single payload.
|
||||
func (c *chClient) createVM(ctx context.Context, cfg *VMConfig) error {
|
||||
memBytes := uint64(cfg.MemoryMB) * 1024 * 1024
|
||||
|
||||
payload := chCreatePayload{
|
||||
Payload: chPayload{
|
||||
Kernel: cfg.KernelPath,
|
||||
Cmdline: cfg.kernelArgs(),
|
||||
},
|
||||
CPUs: chCPUs{
|
||||
BootVCPUs: cfg.VCPUs,
|
||||
MaxVCPUs: cfg.VCPUs,
|
||||
},
|
||||
Memory: chMemory{
|
||||
Size: memBytes,
|
||||
Shared: true,
|
||||
},
|
||||
Disks: []chDisk{
|
||||
{
|
||||
Path: cfg.SandboxDir + "/rootfs.ext4",
|
||||
ImageType: "Raw",
|
||||
},
|
||||
},
|
||||
Net: []chNet{
|
||||
{
|
||||
Tap: cfg.TapDevice,
|
||||
MAC: cfg.TapMAC,
|
||||
},
|
||||
},
|
||||
Balloon: &chBalloon{
|
||||
Size: 0,
|
||||
DeflateOnOOM: true,
|
||||
FreePageRep: true,
|
||||
},
|
||||
Serial: chConsole{
|
||||
Mode: "Tty",
|
||||
},
|
||||
Console: chConsole{
|
||||
Mode: "Off",
|
||||
},
|
||||
}
|
||||
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.create", payload)
|
||||
}
|
||||
|
||||
// bootVM starts the VM after creation.
|
||||
func (c *chClient) bootVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.boot", nil)
|
||||
}
|
||||
|
||||
// pauseVM pauses the microVM.
|
||||
func (c *chClient) pauseVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.pause", nil)
|
||||
}
|
||||
|
||||
// resumeVM resumes a paused microVM.
|
||||
func (c *chClient) resumeVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.resume", nil)
|
||||
}
|
||||
|
||||
// snapshotVM creates a VM snapshot to the given directory.
|
||||
func (c *chClient) snapshotVM(ctx context.Context, destURL string) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.snapshot", map[string]string{
|
||||
"destination_url": destURL,
|
||||
})
|
||||
}
|
||||
|
||||
// restoreVM restores a VM from a snapshot via the API. Uses OnDemand memory
|
||||
// restore mode for UFFD-based lazy page loading — only pages the guest
|
||||
// actually touches are faulted in from disk.
|
||||
func (c *chClient) restoreVM(ctx context.Context, sourceURL string) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.restore", map[string]any{
|
||||
"source_url": sourceURL,
|
||||
"memory_restore_mode": "OnDemand",
|
||||
"resume": true,
|
||||
})
|
||||
}
|
||||
|
||||
// shutdownVMM cleanly shuts down the Cloud Hypervisor VMM process.
|
||||
func (c *chClient) shutdownVMM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vmm.shutdown", nil)
|
||||
}
|
||||
|
||||
// resizeBalloon adjusts the balloon target at runtime.
|
||||
// sizeBytes is memory to take FROM the guest (0 = give all back).
|
||||
func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error {
|
||||
return c.do(ctx, http.MethodPut, "/api/v1/vm.resize", map[string]int64{
|
||||
"desired_balloon": sizeBytes,
|
||||
})
|
||||
}
|
||||
|
||||
// ping checks if the VMM is alive and ready to accept commands.
|
||||
func (c *chClient) ping(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodGet, "/api/v1/vmm.ping", nil)
|
||||
}
|
||||
@ -2,13 +2,12 @@ package vm
|
||||
|
||||
import "fmt"
|
||||
|
||||
// VMConfig holds the configuration for creating a Firecracker microVM.
|
||||
// VMConfig holds the configuration for creating a Cloud Hypervisor microVM.
|
||||
type VMConfig struct {
|
||||
// SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4").
|
||||
SandboxID string
|
||||
|
||||
// TemplateID is the template UUID string used to populate MMDS metadata
|
||||
// so that envd can read WRENN_TEMPLATE_ID from inside the guest.
|
||||
// TemplateID is the template UUID string, passed to envd via PostInit.
|
||||
TemplateID string
|
||||
|
||||
// KernelPath is the path to the uncompressed Linux kernel (vmlinux).
|
||||
@ -25,12 +24,12 @@ type VMConfig struct {
|
||||
MemoryMB int
|
||||
|
||||
// NetworkNamespace is the name of the network namespace to launch
|
||||
// Firecracker inside (e.g., "ns-1"). The namespace must already exist
|
||||
// Cloud Hypervisor inside (e.g., "ns-1"). The namespace must already exist
|
||||
// with a TAP device configured.
|
||||
NetworkNamespace string
|
||||
|
||||
// TapDevice is the name of the TAP device inside the network namespace
|
||||
// that Firecracker will attach to (e.g., "tap0").
|
||||
// that Cloud Hypervisor will attach to (e.g., "tap0").
|
||||
TapDevice string
|
||||
|
||||
// TapMAC is the MAC address for the TAP device.
|
||||
@ -45,19 +44,23 @@ type VMConfig struct {
|
||||
// NetMask is the subnet mask for the guest network (e.g., "255.255.255.252").
|
||||
NetMask string
|
||||
|
||||
// FirecrackerBin is the path to the firecracker binary.
|
||||
FirecrackerBin string
|
||||
// VMMBin is the path to the cloud-hypervisor binary.
|
||||
VMMBin string
|
||||
|
||||
// SocketPath is the path for the Firecracker API Unix socket.
|
||||
// SocketPath is the path for the Cloud Hypervisor API Unix socket.
|
||||
SocketPath string
|
||||
|
||||
// SandboxDir is the tmpfs mount point for per-sandbox files inside the
|
||||
// mount namespace (e.g., "/fc-vm").
|
||||
// mount namespace (e.g., "/ch-vm").
|
||||
SandboxDir string
|
||||
|
||||
// InitPath is the path to the init process inside the guest.
|
||||
// Defaults to "/sbin/init" if empty.
|
||||
InitPath string
|
||||
|
||||
// SnapshotDir is the path to the snapshot directory for restore.
|
||||
// Only set when restoring from a snapshot.
|
||||
SnapshotDir string
|
||||
}
|
||||
|
||||
func (c *VMConfig) applyDefaults() {
|
||||
@ -67,14 +70,14 @@ func (c *VMConfig) applyDefaults() {
|
||||
if c.MemoryMB == 0 {
|
||||
c.MemoryMB = 512
|
||||
}
|
||||
if c.FirecrackerBin == "" {
|
||||
c.FirecrackerBin = "/usr/local/bin/firecracker"
|
||||
if c.VMMBin == "" {
|
||||
c.VMMBin = "/usr/local/bin/cloud-hypervisor"
|
||||
}
|
||||
if c.SocketPath == "" {
|
||||
c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID)
|
||||
c.SocketPath = fmt.Sprintf("/tmp/ch-%s.sock", c.SandboxID)
|
||||
}
|
||||
if c.SandboxDir == "" {
|
||||
c.SandboxDir = "/tmp/fc-vm"
|
||||
c.SandboxDir = "/tmp/ch-vm"
|
||||
}
|
||||
if c.TapDevice == "" {
|
||||
c.TapDevice = "tap0"
|
||||
@ -95,7 +98,7 @@ func (c *VMConfig) kernelArgs() string {
|
||||
)
|
||||
|
||||
return fmt.Sprintf(
|
||||
"console=ttyS0 reboot=k panic=1 pci=off quiet loglevel=1 clocksource=kvm-clock init=%s %s",
|
||||
"console=ttyS0 root=/dev/vda rw reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=%s %s",
|
||||
c.InitPath, ipArg,
|
||||
)
|
||||
}
|
||||
|
||||
@ -1,202 +0,0 @@
|
||||
package vm
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// fcClient talks to the Firecracker HTTP API over a Unix socket.
|
||||
type fcClient struct {
|
||||
http *http.Client
|
||||
socketPath string
|
||||
}
|
||||
|
||||
func newFCClient(socketPath string) *fcClient {
|
||||
return &fcClient{
|
||||
socketPath: socketPath,
|
||||
http: &http.Client{
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||
var d net.Dialer
|
||||
return d.DialContext(ctx, "unix", socketPath)
|
||||
},
|
||||
},
|
||||
// No global timeout — callers pass context.Context with appropriate
|
||||
// deadlines. A fixed 10s timeout was too short for snapshot/resume
|
||||
// operations on large-memory VMs (20GB+ memfiles).
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *fcClient) do(ctx context.Context, method, path string, body any) error {
|
||||
var bodyReader io.Reader
|
||||
if body != nil {
|
||||
data, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal request body: %w", err)
|
||||
}
|
||||
bodyReader = bytes.NewReader(data)
|
||||
}
|
||||
|
||||
// The host in the URL is ignored for Unix sockets; we use "localhost" by convention.
|
||||
req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("%s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 300 {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// setBootSource configures the kernel and boot args.
|
||||
func (c *fcClient) setBootSource(ctx context.Context, kernelPath, bootArgs string) error {
|
||||
return c.do(ctx, http.MethodPut, "/boot-source", map[string]string{
|
||||
"kernel_image_path": kernelPath,
|
||||
"boot_args": bootArgs,
|
||||
})
|
||||
}
|
||||
|
||||
// setRootfsDrive configures the root filesystem drive.
|
||||
func (c *fcClient) setRootfsDrive(ctx context.Context, driveID, path string, readOnly bool) error {
|
||||
return c.do(ctx, http.MethodPut, "/drives/"+driveID, map[string]any{
|
||||
"drive_id": driveID,
|
||||
"path_on_host": path,
|
||||
"is_root_device": true,
|
||||
"is_read_only": readOnly,
|
||||
})
|
||||
}
|
||||
|
||||
// setNetworkInterface configures a network interface attached to a TAP device.
|
||||
// A tx_rate_limiter caps sustained guest→host throughput to prevent user
|
||||
// application traffic from completely saturating the TAP device and starving
|
||||
// envd control traffic (PTY, exec, file ops).
|
||||
func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, macAddr string) error {
|
||||
return c.do(ctx, http.MethodPut, "/network-interfaces/"+ifaceID, map[string]any{
|
||||
"iface_id": ifaceID,
|
||||
"host_dev_name": tapName,
|
||||
"guest_mac": macAddr,
|
||||
"tx_rate_limiter": map[string]any{
|
||||
"bandwidth": map[string]any{
|
||||
"size": 209715200, // 200 MB/s sustained
|
||||
"refill_time": 1000, // refill period: 1 second
|
||||
"one_time_burst": 104857600, // 100 MB initial burst
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// setMachineConfig configures vCPUs, memory, and other machine settings.
|
||||
func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error {
|
||||
return c.do(ctx, http.MethodPut, "/machine-config", map[string]any{
|
||||
"vcpu_count": vcpus,
|
||||
"mem_size_mib": memMB,
|
||||
"smt": false,
|
||||
})
|
||||
}
|
||||
|
||||
// setMMDSConfig enables MMDS V2 token-based access on the given network interface.
|
||||
// Must be called before startVM.
|
||||
func (c *fcClient) setMMDSConfig(ctx context.Context, ifaceID string) error {
|
||||
return c.do(ctx, http.MethodPut, "/mmds/config", map[string]any{
|
||||
"version": "V2",
|
||||
"network_interfaces": []string{ifaceID},
|
||||
})
|
||||
}
|
||||
|
||||
// mmdsMetadata is the metadata payload written to the Firecracker MMDS store.
|
||||
// envd reads this via PollForMMDSOpts to populate WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID.
|
||||
type mmdsMetadata struct {
|
||||
SandboxID string `json:"instanceID"`
|
||||
TemplateID string `json:"envID"`
|
||||
}
|
||||
|
||||
// setMMDS writes sandbox metadata to the Firecracker MMDS store.
|
||||
// Can be called after the VM has started.
|
||||
func (c *fcClient) setMMDS(ctx context.Context, sandboxID, templateID string) error {
|
||||
return c.do(ctx, http.MethodPut, "/mmds", mmdsMetadata{
|
||||
SandboxID: sandboxID,
|
||||
TemplateID: templateID,
|
||||
})
|
||||
}
|
||||
|
||||
// setBalloon configures the Firecracker balloon device for dynamic memory
|
||||
// management. deflateOnOom lets the guest reclaim balloon pages under memory
|
||||
// pressure. statsInterval enables periodic stats via GET /balloon/statistics.
|
||||
// Must be called before startVM.
|
||||
func (c *fcClient) setBalloon(ctx context.Context, amountMiB int, deflateOnOom bool, statsIntervalS int) error {
|
||||
return c.do(ctx, http.MethodPut, "/balloon", map[string]any{
|
||||
"amount_mib": amountMiB,
|
||||
"deflate_on_oom": deflateOnOom,
|
||||
"stats_polling_interval_s": statsIntervalS,
|
||||
})
|
||||
}
|
||||
|
||||
// updateBalloon adjusts the balloon target at runtime.
|
||||
func (c *fcClient) updateBalloon(ctx context.Context, amountMiB int) error {
|
||||
return c.do(ctx, http.MethodPatch, "/balloon", map[string]any{
|
||||
"amount_mib": amountMiB,
|
||||
})
|
||||
}
|
||||
|
||||
// startVM issues the InstanceStart action.
|
||||
func (c *fcClient) startVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPut, "/actions", map[string]string{
|
||||
"action_type": "InstanceStart",
|
||||
})
|
||||
}
|
||||
|
||||
// pauseVM pauses the microVM.
|
||||
func (c *fcClient) pauseVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
|
||||
"state": "Paused",
|
||||
})
|
||||
}
|
||||
|
||||
// resumeVM resumes a paused microVM.
|
||||
func (c *fcClient) resumeVM(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPatch, "/vm", map[string]string{
|
||||
"state": "Resumed",
|
||||
})
|
||||
}
|
||||
|
||||
// createSnapshot creates a VM snapshot.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
|
||||
return c.do(ctx, http.MethodPut, "/snapshot/create", map[string]any{
|
||||
"snapshot_type": snapshotType,
|
||||
"snapshot_path": snapPath,
|
||||
"mem_file_path": memPath,
|
||||
})
|
||||
}
|
||||
|
||||
// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
|
||||
// lazy memory loading. Firecracker will connect to the socket and
|
||||
// send the uffd fd + memory region mappings.
|
||||
func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error {
|
||||
return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
|
||||
"snapshot_path": snapPath,
|
||||
"resume_vm": false,
|
||||
"mem_backend": map[string]any{
|
||||
"backend_type": "Uffd",
|
||||
"backend_path": uffdSocketPath,
|
||||
},
|
||||
})
|
||||
}
|
||||
@ -9,14 +9,14 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// VM represents a running Firecracker microVM.
|
||||
// VM represents a running Cloud Hypervisor microVM.
|
||||
type VM struct {
|
||||
Config VMConfig
|
||||
process *process
|
||||
client *fcClient
|
||||
client *chClient
|
||||
}
|
||||
|
||||
// Manager handles the lifecycle of Firecracker microVMs.
|
||||
// Manager handles the lifecycle of Cloud Hypervisor microVMs.
|
||||
type Manager struct {
|
||||
mu sync.RWMutex
|
||||
// vms tracks running VMs by sandbox ID.
|
||||
@ -30,7 +30,7 @@ func NewManager() *Manager {
|
||||
}
|
||||
}
|
||||
|
||||
// Create boots a new Firecracker microVM with the given configuration.
|
||||
// Create boots a new Cloud Hypervisor microVM with the given configuration.
|
||||
// The network namespace and TAP device must already be set up.
|
||||
func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
cfg.applyDefaults()
|
||||
@ -38,7 +38,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return nil, fmt.Errorf("invalid config: %w", err)
|
||||
}
|
||||
|
||||
// Clean up any leftover socket from a previous run.
|
||||
os.Remove(cfg.SocketPath)
|
||||
|
||||
slog.Info("creating VM",
|
||||
@ -47,7 +46,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
"memory_mb", cfg.MemoryMB,
|
||||
)
|
||||
|
||||
// Step 1: Launch the Firecracker process.
|
||||
// Step 1: Launch the Cloud Hypervisor process.
|
||||
proc, err := startProcess(ctx, &cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
@ -59,25 +58,18 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return nil, fmt.Errorf("wait for socket: %w", err)
|
||||
}
|
||||
|
||||
// Step 3: Configure the VM via the Firecracker API.
|
||||
client := newFCClient(cfg.SocketPath)
|
||||
// Step 3: Configure and boot the VM via a single API call.
|
||||
client := newCHClient(cfg.SocketPath)
|
||||
|
||||
if err := configureVM(ctx, client, &cfg); err != nil {
|
||||
if err := client.createVM(ctx, &cfg); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("configure VM: %w", err)
|
||||
return nil, fmt.Errorf("create VM config: %w", err)
|
||||
}
|
||||
|
||||
// Step 4: Start the VM.
|
||||
if err := client.startVM(ctx); err != nil {
|
||||
// Step 4: Boot the VM.
|
||||
if err := client.bootVM(ctx); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("start VM: %w", err)
|
||||
}
|
||||
|
||||
// Step 5: Push sandbox metadata into MMDS so envd can read
|
||||
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
|
||||
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("set MMDS metadata: %w", err)
|
||||
return nil, fmt.Errorf("boot VM: %w", err)
|
||||
}
|
||||
|
||||
vm := &VM{
|
||||
@ -95,46 +87,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
|
||||
return vm, nil
|
||||
}
|
||||
|
||||
// configureVM sends the configuration to Firecracker via its HTTP API.
|
||||
func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
|
||||
// Boot source (kernel + args)
|
||||
if err := client.setBootSource(ctx, cfg.KernelPath, cfg.kernelArgs()); err != nil {
|
||||
return fmt.Errorf("set boot source: %w", err)
|
||||
}
|
||||
|
||||
// Root drive — use the symlink path inside the mount namespace so that
|
||||
// snapshots record a stable path that works on restore.
|
||||
rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
|
||||
if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
|
||||
return fmt.Errorf("set rootfs drive: %w", err)
|
||||
}
|
||||
|
||||
// Network interface
|
||||
if err := client.setNetworkInterface(ctx, "eth0", cfg.TapDevice, cfg.TapMAC); err != nil {
|
||||
return fmt.Errorf("set network interface: %w", err)
|
||||
}
|
||||
|
||||
// Machine config (vCPUs + memory)
|
||||
if err := client.setMachineConfig(ctx, cfg.VCPUs, cfg.MemoryMB); err != nil {
|
||||
return fmt.Errorf("set machine config: %w", err)
|
||||
}
|
||||
|
||||
// Balloon device — allows the host to reclaim unused guest memory.
|
||||
// Start with 0 (no inflation). deflate_on_oom lets the guest reclaim
|
||||
// balloon pages under memory pressure. Stats interval enables monitoring.
|
||||
if err := client.setBalloon(ctx, 0, true, 5); err != nil {
|
||||
slog.Warn("set balloon failed (non-fatal, VM will run without memory reclaim)", "error", err)
|
||||
}
|
||||
|
||||
// MMDS config — enable V2 token access on eth0 so that envd can read
|
||||
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
|
||||
if err := client.setMMDSConfig(ctx, "eth0"); err != nil {
|
||||
return fmt.Errorf("set MMDS config: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Pause pauses a running VM.
|
||||
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
m.mu.RLock()
|
||||
@ -179,7 +131,8 @@ func (m *Manager) UpdateBalloon(ctx context.Context, sandboxID string, amountMiB
|
||||
return fmt.Errorf("VM not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
return vm.client.updateBalloon(ctx, amountMiB)
|
||||
sizeBytes := int64(amountMiB) * 1024 * 1024
|
||||
return vm.client.resizeBalloon(ctx, sizeBytes)
|
||||
}
|
||||
|
||||
// Destroy stops and cleans up a VM.
|
||||
@ -195,12 +148,17 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
|
||||
slog.Info("destroying VM", "sandbox", sandboxID)
|
||||
|
||||
// Stop the Firecracker process.
|
||||
// Try clean shutdown first, fall back to process kill.
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
if err := vm.client.shutdownVMM(shutdownCtx); err != nil {
|
||||
slog.Debug("clean VMM shutdown failed, killing process", "sandbox", sandboxID, "error", err)
|
||||
}
|
||||
shutdownCancel()
|
||||
|
||||
if err := vm.process.stop(); err != nil {
|
||||
slog.Warn("error stopping process", "sandbox", sandboxID, "error", err)
|
||||
}
|
||||
|
||||
// Clean up the API socket.
|
||||
os.Remove(vm.Config.SocketPath)
|
||||
|
||||
slog.Info("VM destroyed", "sandbox", sandboxID)
|
||||
@ -208,8 +166,8 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
}
|
||||
|
||||
// Snapshot creates a VM snapshot. The VM must already be paused.
|
||||
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
|
||||
// destURL is the file:// URL to the snapshot directory.
|
||||
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapshotDir string) error {
|
||||
m.mu.RLock()
|
||||
vm, ok := m.vms[sandboxID]
|
||||
m.mu.RUnlock()
|
||||
@ -217,29 +175,35 @@ func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, sn
|
||||
return fmt.Errorf("VM not found: %s", sandboxID)
|
||||
}
|
||||
|
||||
if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
|
||||
destURL := "file://" + snapshotDir
|
||||
if err := vm.client.snapshotVM(ctx, destURL); err != nil {
|
||||
return fmt.Errorf("create snapshot: %w", err)
|
||||
}
|
||||
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
|
||||
slog.Info("VM snapshot created", "sandbox", sandboxID, "snapshot_dir", snapshotDir)
|
||||
return nil
|
||||
}
|
||||
|
||||
// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot
|
||||
// using UFFD for lazy memory loading. The network namespace and TAP
|
||||
// device must already be set up.
|
||||
// CreateFromSnapshot boots a new Cloud Hypervisor VM by restoring from a
|
||||
// snapshot directory. The network namespace and TAP device must already be set up.
|
||||
//
|
||||
// No boot resources (kernel, drives, machine config) are configured —
|
||||
// the snapshot carries all that state. The rootfs path recorded in the
|
||||
// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4
|
||||
// inside the mount namespace (created by the start script in jailer.go).
|
||||
// A bare CH process is started first, then the restore is performed via the API
|
||||
// with memory_restore_mode=OnDemand for UFFD-based lazy page loading. This means
|
||||
// only pages the guest actually touches are faulted in from disk — a 16GB template
|
||||
// with 2GB active working set only loads ~2GB into RAM at restore time.
|
||||
//
|
||||
// The restore API also sets resume=true, so the VM starts running immediately
|
||||
// without a separate resume call.
|
||||
//
|
||||
// The rootfs path recorded in the snapshot is resolved via a stable symlink at
|
||||
// SandboxDir/rootfs.ext4 inside the mount namespace.
|
||||
//
|
||||
// The sequence is:
|
||||
// 1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink)
|
||||
// 1. Start bare CH process in mount+network namespace
|
||||
// 2. Wait for API socket
|
||||
// 3. Load snapshot with UFFD backend
|
||||
// 4. Resume VM execution
|
||||
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) {
|
||||
// 3. Restore VM via API (OnDemand memory + auto-resume)
|
||||
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapshotDir string) (*VM, error) {
|
||||
cfg.SnapshotDir = snapshotDir
|
||||
cfg.applyDefaults()
|
||||
if err := cfg.validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid config: %w", err)
|
||||
@ -249,14 +213,11 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
|
||||
slog.Info("restoring VM from snapshot",
|
||||
"sandbox", cfg.SandboxID,
|
||||
"snap_path", snapPath,
|
||||
"snapshot_dir", snapshotDir,
|
||||
)
|
||||
|
||||
// Step 1: Launch the Firecracker process.
|
||||
// The start script creates a tmpfs at SandboxDir and symlinks
|
||||
// rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs
|
||||
// path (/fc-vm/rootfs.ext4) resolves to the new clone.
|
||||
proc, err := startProcess(ctx, &cfg)
|
||||
// Step 1: Launch bare CH process (no --restore).
|
||||
proc, err := startProcessForRestore(ctx, &cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("start process: %w", err)
|
||||
}
|
||||
@ -267,26 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
return nil, fmt.Errorf("wait for socket: %w", err)
|
||||
}
|
||||
|
||||
client := newFCClient(cfg.SocketPath)
|
||||
client := newCHClient(cfg.SocketPath)
|
||||
|
||||
// Step 3: Load the snapshot with UFFD backend.
|
||||
// No boot resources are configured — the snapshot carries kernel,
|
||||
// drive, network, and machine config state.
|
||||
if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
|
||||
// Step 3: Restore via API with OnDemand memory + auto-resume.
|
||||
sourceURL := "file://" + snapshotDir
|
||||
if err := client.restoreVM(ctx, sourceURL); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("load snapshot: %w", err)
|
||||
}
|
||||
|
||||
// Step 4: Resume the VM.
|
||||
if err := client.resumeVM(ctx); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("resume VM: %w", err)
|
||||
}
|
||||
|
||||
// Step 5: Push sandbox metadata into MMDS.
|
||||
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
|
||||
_ = proc.stop()
|
||||
return nil, fmt.Errorf("set MMDS metadata: %w", err)
|
||||
return nil, fmt.Errorf("restore VM: %w", err)
|
||||
}
|
||||
|
||||
vm := &VM{
|
||||
@ -304,11 +252,15 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
|
||||
}
|
||||
|
||||
// PID returns the process ID of the unshare wrapper process.
|
||||
// The actual Firecracker process is a direct child of this PID.
|
||||
func (v *VM) PID() int {
|
||||
return v.process.cmd.Process.Pid
|
||||
}
|
||||
|
||||
// Exited returns a channel that is closed when the VM process exits.
|
||||
func (v *VM) Exited() <-chan struct{} {
|
||||
return v.process.exited()
|
||||
}
|
||||
|
||||
// Get returns a running VM by sandbox ID.
|
||||
func (m *Manager) Get(sandboxID string) (*VM, bool) {
|
||||
m.mu.RLock()
|
||||
@ -317,7 +269,7 @@ func (m *Manager) Get(sandboxID string) (*VM, bool) {
|
||||
return vm, ok
|
||||
}
|
||||
|
||||
// waitForSocket polls for the Firecracker API socket to appear on disk.
|
||||
// waitForSocket polls for the Cloud Hypervisor API socket to appear on disk.
|
||||
func waitForSocket(ctx context.Context, socketPath string, proc *process) error {
|
||||
ticker := time.NewTicker(10 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
@ -329,7 +281,7 @@ func waitForSocket(ctx context.Context, socketPath string, proc *process) error
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-proc.exited():
|
||||
return fmt.Errorf("firecracker process exited before socket was ready")
|
||||
return fmt.Errorf("cloud-hypervisor process exited before socket was ready")
|
||||
case <-timeout:
|
||||
return fmt.Errorf("timed out waiting for API socket at %s", socketPath)
|
||||
case <-ticker.C:
|
||||
|
||||
@ -10,7 +10,7 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
// process represents a running Firecracker process with mount and network
|
||||
// process represents a running Cloud Hypervisor process with mount and network
|
||||
// namespace isolation.
|
||||
type process struct {
|
||||
cmd *exec.Cmd
|
||||
@ -20,33 +20,42 @@ type process struct {
|
||||
exitErr error
|
||||
}
|
||||
|
||||
// startProcess launches the Firecracker binary inside an isolated mount namespace
|
||||
// and the specified network namespace. The launch sequence:
|
||||
// startProcess launches the Cloud Hypervisor binary inside an isolated mount
|
||||
// namespace and the specified network namespace. Used for fresh boot (no
|
||||
// snapshot). The launch sequence:
|
||||
//
|
||||
// 1. unshare -m: creates a private mount namespace
|
||||
// 2. mount --make-rprivate /: prevents mount propagation to host
|
||||
// 3. mount tmpfs at SandboxDir: ephemeral workspace for this VM
|
||||
// 4. symlink kernel and rootfs into SandboxDir
|
||||
// 5. ip netns exec <ns>: enters the network namespace where TAP is configured
|
||||
// 6. exec firecracker with the API socket path
|
||||
// 6. exec cloud-hypervisor with the API socket path
|
||||
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
// Use a background context for the long-lived Firecracker process.
|
||||
// The request context (ctx) is only used for the startup phase — we must
|
||||
// not tie the VM's lifetime to the HTTP request that created it.
|
||||
execCtx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
script := buildStartScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
|
||||
// startProcessForRestore launches a bare Cloud Hypervisor process (no --restore).
|
||||
// The restore is performed via the API after the socket is ready, which allows
|
||||
// passing memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||
func startProcessForRestore(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
script := buildRestoreScript(cfg)
|
||||
return launchScript(script, cfg)
|
||||
}
|
||||
|
||||
func launchScript(script string, cfg *VMConfig) (*process, error) {
|
||||
execCtx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
cmd := exec.CommandContext(execCtx, "unshare", "-m", "--", "bash", "-c", script)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Setsid: true, // new session so signals don't propagate from parent
|
||||
Setsid: true,
|
||||
}
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
cancel()
|
||||
return nil, fmt.Errorf("start firecracker process: %w", err)
|
||||
return nil, fmt.Errorf("start cloud-hypervisor process: %w", err)
|
||||
}
|
||||
|
||||
p := &process{
|
||||
@ -60,7 +69,7 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
close(p.exitCh)
|
||||
}()
|
||||
|
||||
slog.Info("firecracker process started",
|
||||
slog.Info("cloud-hypervisor process started",
|
||||
"pid", cmd.Process.Pid,
|
||||
"sandbox", cfg.SandboxID,
|
||||
)
|
||||
@ -68,35 +77,56 @@ func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// buildStartScript generates the bash script that sets up the mount namespace,
|
||||
// symlinks kernel/rootfs, and execs Firecracker inside the network namespace.
|
||||
// buildStartScript generates the bash script for fresh boot: sets up mount
|
||||
// namespace, symlinks kernel/rootfs, and execs Cloud Hypervisor.
|
||||
func buildStartScript(cfg *VMConfig) string {
|
||||
return fmt.Sprintf(`
|
||||
set -euo pipefail
|
||||
|
||||
# Prevent mount propagation to the host
|
||||
mount --make-rprivate /
|
||||
|
||||
# Create ephemeral tmpfs workspace
|
||||
mkdir -p %[1]s
|
||||
mount -t tmpfs tmpfs %[1]s
|
||||
|
||||
# Symlink kernel and rootfs into the workspace
|
||||
ln -s %[2]s %[1]s/vmlinux
|
||||
ln -s %[3]s %[1]s/rootfs.ext4
|
||||
|
||||
# Launch Firecracker inside the network namespace
|
||||
exec ip netns exec %[4]s %[5]s --api-sock %[6]s
|
||||
exec ip netns exec %[4]s %[5]s --api-socket path=%[6]s
|
||||
`,
|
||||
cfg.SandboxDir, // 1
|
||||
cfg.KernelPath, // 2
|
||||
cfg.RootfsPath, // 3
|
||||
cfg.NetworkNamespace, // 4
|
||||
cfg.FirecrackerBin, // 5
|
||||
cfg.VMMBin, // 5
|
||||
cfg.SocketPath, // 6
|
||||
)
|
||||
}
|
||||
|
||||
// buildRestoreScript generates the bash script for snapshot restore: sets up
|
||||
// mount namespace, symlinks rootfs, and starts a bare Cloud Hypervisor process.
|
||||
// The actual restore is done via the API (PUT /vm.restore) after the socket is
|
||||
// ready, which enables memory_restore_mode=OnDemand for UFFD lazy paging.
|
||||
func buildRestoreScript(cfg *VMConfig) string {
|
||||
return fmt.Sprintf(`
|
||||
set -euo pipefail
|
||||
|
||||
mount --make-rprivate /
|
||||
|
||||
mkdir -p %[1]s
|
||||
mount -t tmpfs tmpfs %[1]s
|
||||
|
||||
ln -s %[2]s %[1]s/rootfs.ext4
|
||||
|
||||
exec ip netns exec %[3]s %[4]s --api-socket path=%[5]s
|
||||
`,
|
||||
cfg.SandboxDir, // 1
|
||||
cfg.RootfsPath, // 2
|
||||
cfg.NetworkNamespace, // 3
|
||||
cfg.VMMBin, // 4
|
||||
cfg.SocketPath, // 5
|
||||
)
|
||||
}
|
||||
|
||||
// stop sends SIGTERM and waits for the process to exit. If it doesn't exit
|
||||
// within 10 seconds, SIGKILL is sent.
|
||||
func (p *process) stop() error {
|
||||
@ -104,7 +134,6 @@ func (p *process) stop() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Send SIGTERM to the process group (negative PID).
|
||||
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGTERM); err != nil {
|
||||
slog.Debug("sigterm failed, process may have exited", "error", err)
|
||||
}
|
||||
@ -113,7 +142,7 @@ func (p *process) stop() error {
|
||||
case <-p.exitCh:
|
||||
return nil
|
||||
case <-time.After(10 * time.Second):
|
||||
slog.Warn("firecracker did not exit after SIGTERM, sending SIGKILL")
|
||||
slog.Warn("cloud-hypervisor did not exit after SIGTERM, sending SIGKILL")
|
||||
if err := syscall.Kill(-p.cmd.Process.Pid, syscall.SIGKILL); err != nil {
|
||||
slog.Debug("sigkill failed", "error", err)
|
||||
}
|
||||
Reference in New Issue
Block a user