1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#50
This commit is contained in:
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions

232
internal/vm/ch.go Normal file
View File

@ -0,0 +1,232 @@
package vm
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
)
// chClient talks to the Cloud Hypervisor HTTP API over a Unix socket.
//
// Requests are issued through the http field; newCHClient configures its
// transport to dial socketPath rather than a TCP address.
type chClient struct {
// http is the client used for every API request.
http *http.Client
// socketPath is the filesystem path of the CH API Unix socket.
socketPath string
}
// newCHClient builds a chClient whose HTTP transport always dials the Unix
// socket at socketPath, ignoring the network and address derived from the
// request URL. No Client.Timeout is configured, so deadlines come only from
// the context passed with each request.
func newCHClient(socketPath string) *chClient {
	dialUnix := func(ctx context.Context, _, _ string) (net.Conn, error) {
		var dialer net.Dialer
		return dialer.DialContext(ctx, "unix", socketPath)
	}
	transport := &http.Transport{DialContext: dialUnix}

	client := &chClient{socketPath: socketPath}
	client.http = &http.Client{Transport: transport}
	return client
}
// do issues a request whose response body is not needed. It is doJSON with
// no decode target: body (if non-nil) is sent as JSON and only the status
// code decides success or failure.
func (c *chClient) do(ctx context.Context, method, path string, body any) error {
	err := c.doJSON(ctx, method, path, body, nil)
	return err
}
// doJSON sends a request to the Cloud Hypervisor API and optionally decodes a
// JSON response into out.
//
// body, when non-nil, is marshalled to JSON and sent with a
// "Content-Type: application/json" header. out may be nil if the response
// body should be discarded. Any status code >= 300 is returned as an error
// that carries the response body, truncated to maxBody bytes.
func (c *chClient) doJSON(ctx context.Context, method, path string, body, out any) error {
	// maxBody bounds how much of a response is buffered into an error message
	// or drained on close, so a misbehaving peer cannot make us read forever.
	const maxBody = 1 << 20

	var bodyReader io.Reader
	if body != nil {
		data, err := json.Marshal(body)
		if err != nil {
			return fmt.Errorf("marshal request body: %w", err)
		}
		bodyReader = bytes.NewReader(data)
	}

	// The host part of the URL is a placeholder; the transport installed by
	// newCHClient dials the Unix socket regardless of the address.
	req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return fmt.Errorf("%s %s: %w", method, path, err)
	}
	defer func() {
		// Read the remainder before closing: the transport can only reuse a
		// keep-alive connection when the body was consumed to EOF. Both
		// calls are best-effort; the request outcome is already decided.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxBody))
		_ = resp.Body.Close()
	}()

	if resp.StatusCode >= 300 {
		// Best-effort: a failed read still yields a useful status-only error.
		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxBody))
		return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
	}

	if out != nil {
		if err := json.NewDecoder(resp.Body).Decode(out); err != nil {
			return fmt.Errorf("%s %s: decode response: %w", method, path, err)
		}
	}
	return nil
}
// boolPtr returns a pointer to a fresh copy of b, for populating optional
// *bool fields in struct literals.
func boolPtr(b bool) *bool {
	value := b
	return &value
}
// --- CH API payload types ---
//
// These structs are marshalled to JSON by createVM and sent as the body of
// PUT /api/v1/vm.create. Fields tagged omitempty are left out of the request
// when they hold their zero value.

// chPayload describes what the guest boots: an optional firmware image, the
// kernel, and the kernel command line.
type chPayload struct {
Firmware string `json:"firmware,omitempty"`
Kernel string `json:"kernel"`
Cmdline string `json:"cmdline"`
}
// chCPUs sets the vCPU count at boot and the upper bound. createVM sets both
// to the same value.
type chCPUs struct {
BootVCPUs int `json:"boot_vcpus"`
MaxVCPUs int `json:"max_vcpus"`
}
// chMemory configures guest RAM. createVM fills Size with a byte count
// (MemoryMB * 1024 * 1024).
type chMemory struct {
Size uint64 `json:"size"`
Shared bool `json:"shared,omitempty"`
// Thp uses a pointer with NO omitempty so explicit false is always
// serialized (CH defaults to true). Must be false so the backing memfile
// remains 4 KiB-granular: balloon-reported free pages get punched as
// holes and CH's SEEK_DATA/SEEK_HOLE snapshot writer (v52+) skips them.
// A nil Thp would silently re-enable THP and break sparse snapshots —
// rejecting "thp": null at the wire is preferable to a silent fallback.
Thp *bool `json:"thp"`
Prefault bool `json:"prefault,omitempty"`
HotplugSize uint64 `json:"hotplug_size,omitempty"`
HotplugMethod string `json:"hotplug_method,omitempty"`
}
// chDisk describes one block device by path. createVM passes ImageType "Raw"
// for the root filesystem.
type chDisk struct {
Path string `json:"path"`
Readonly bool `json:"readonly,omitempty"`
ImageType string `json:"image_type,omitempty"`
}
// chNet attaches the guest NIC to an existing TAP device with the given MAC.
// The queue settings are omitted from the request when zero.
type chNet struct {
Tap string `json:"tap"`
MAC string `json:"mac"`
NumQs int `json:"num_queues,omitempty"`
QueueS int `json:"queue_size,omitempty"`
}
// chBalloon configures the balloon device. size and deflate_on_oom are always
// serialized; free_page_reporting only when true.
type chBalloon struct {
Size int64 `json:"size"`
DeflateOnOOM bool `json:"deflate_on_oom"`
FreePageRep bool `json:"free_page_reporting,omitempty"`
}
// chConsole selects the mode of a console-like device. createVM uses "Tty"
// for the serial port and "Off" for the console.
type chConsole struct {
Mode string `json:"mode"`
}
// chCreatePayload is the complete vm.create request body. Balloon is a
// pointer so that a nil value omits the device from the request entirely.
type chCreatePayload struct {
Payload chPayload `json:"payload"`
CPUs chCPUs `json:"cpus"`
Memory chMemory `json:"memory"`
Disks []chDisk `json:"disks"`
Net []chNet `json:"net"`
Balloon *chBalloon `json:"balloon,omitempty"`
Serial chConsole `json:"serial"`
Console chConsole `json:"console"`
}
// createVM sends the complete VM definition to CH in a single vm.create
// request: kernel and command line, vCPUs, shared non-THP memory, the root
// disk at SandboxDir/rootfs.ext4, one TAP-backed NIC, a zero-sized balloon
// with deflate-on-OOM and free page reporting, a Tty serial port, and the
// console switched off.
func (c *chClient) createVM(ctx context.Context, cfg *VMConfig) error {
	const mib = 1024 * 1024

	rootDisk := chDisk{
		Path:      cfg.SandboxDir + "/rootfs.ext4",
		ImageType: "Raw",
	}
	nic := chNet{
		Tap: cfg.TapDevice,
		MAC: cfg.TapMAC,
	}

	var req chCreatePayload
	req.Payload = chPayload{Kernel: cfg.KernelPath, Cmdline: cfg.kernelArgs()}
	req.CPUs = chCPUs{BootVCPUs: cfg.VCPUs, MaxVCPUs: cfg.VCPUs}
	req.Memory = chMemory{
		Size:   uint64(cfg.MemoryMB) * mib,
		Shared: true,
		// Always sent explicitly; see the Thp field comment on chMemory.
		Thp: boolPtr(false),
	}
	req.Disks = []chDisk{rootDisk}
	req.Net = []chNet{nic}
	req.Balloon = &chBalloon{Size: 0, DeflateOnOOM: true, FreePageRep: true}
	req.Serial = chConsole{Mode: "Tty"}
	req.Console = chConsole{Mode: "Off"}

	return c.do(ctx, http.MethodPut, "/api/v1/vm.create", req)
}
// bootVM asks CH to boot the VM previously defined with createVM.
func (c *chClient) bootVM(ctx context.Context) error {
	const endpoint = "/api/v1/vm.boot"
	return c.do(ctx, http.MethodPut, endpoint, nil)
}
// shutdownVMM requests a clean shutdown of the Cloud Hypervisor VMM process
// through the vmm.shutdown endpoint.
func (c *chClient) shutdownVMM(ctx context.Context) error {
	const endpoint = "/api/v1/vmm.shutdown"
	return c.do(ctx, http.MethodPut, endpoint, nil)
}
// resizeBalloon changes the balloon target of a running VM via vm.resize.
// sizeBytes is the amount of memory to take FROM the guest; 0 gives all of
// it back.
func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error {
	req := struct {
		DesiredBalloon int64 `json:"desired_balloon"`
	}{DesiredBalloon: sizeBytes}
	return c.do(ctx, http.MethodPut, "/api/v1/vm.resize", req)
}
// pauseVM freezes the guest's vCPUs and devices through the CH vm.pause
// endpoint.
func (c *chClient) pauseVM(ctx context.Context) error {
	const endpoint = "/api/v1/vm.pause"
	return c.do(ctx, http.MethodPut, endpoint, nil)
}
// resumeVM unfreezes a paused VM through the CH vm.resume endpoint.
func (c *chClient) resumeVM(ctx context.Context) error {
	const endpoint = "/api/v1/vm.resume"
	return c.do(ctx, http.MethodPut, endpoint, nil)
}
// snapshotVM asks CH to write VM config, state and memory to destURL, a
// directory URL of the form `file:///abs/path/`. The VM must be paused
// before calling.
func (c *chClient) snapshotVM(ctx context.Context, destURL string) error {
	req := struct {
		DestinationURL string `json:"destination_url"`
	}{DestinationURL: destURL}
	return c.do(ctx, http.MethodPut, "/api/v1/vm.snapshot", req)
}
// vmInfo queries vm.info and returns the "state" field of the response.
// Used after a restore to confirm CH successfully hydrated the snapshot
// before the VM is registered. On failure the returned state is empty.
func (c *chClient) vmInfo(ctx context.Context) (state string, err error) {
	var info struct {
		State string `json:"state"`
	}
	err = c.doJSON(ctx, http.MethodGet, "/api/v1/vm.info", nil, &info)
	if err != nil {
		return "", err
	}
	return info.State, nil
}

104
internal/vm/cleanup.go Normal file
View File

@ -0,0 +1,104 @@
package vm
import (
"log/slog"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"syscall"
"time"
)
// CleanupStaleProcesses kills any cloud-hypervisor processes left behind by a
// previous agent that crashed without graceful shutdown. Must run at agent
// startup before devicemapper.CleanupStaleDevices — a still-running CH process
// holds the dm-snapshot open and would cause "Device or resource busy" on
// dmsetup remove.
//
// Matches processes by argv containing the wrenn CH API socket path
// (/tmp/ch-<sandboxID>.sock) so we don't kill unrelated cloud-hypervisor VMs
// the operator may be running.
//
// Also removes stale /tmp/ch-*.sock files once the owning process is gone.
func CleanupStaleProcesses() {
socketPattern := regexp.MustCompile(`/tmp/ch-[A-Za-z0-9-]+\.sock`)
pids, err := scanProcs()
if err != nil {
slog.Debug("scan procs failed", "error", err)
return
}
killed := 0
for _, pid := range pids {
cmdline, err := readCmdline(pid)
if err != nil {
continue
}
if !strings.Contains(cmdline, "cloud-hypervisor") {
continue
}
if !socketPattern.MatchString(cmdline) {
continue
}
slog.Warn("killing stale cloud-hypervisor process", "pid", pid, "cmdline", cmdline)
if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
slog.Warn("SIGTERM stale CH failed", "pid", pid, "error", err)
}
killed++
}
// Give SIGTERM'd processes a brief window to exit so subsequent dm/loop
// teardown sees no open fd, then SIGKILL anything still alive.
if killed > 0 {
time.Sleep(500 * time.Millisecond)
for _, pid := range pids {
cmdline, err := readCmdline(pid)
if err != nil {
continue
}
if !strings.Contains(cmdline, "cloud-hypervisor") || !socketPattern.MatchString(cmdline) {
continue
}
_ = syscall.Kill(pid, syscall.SIGKILL)
}
time.Sleep(200 * time.Millisecond)
}
matches, _ := filepath.Glob("/tmp/ch-*.sock")
for _, sock := range matches {
if err := os.Remove(sock); err == nil {
slog.Info("removed stale CH socket", "path", sock)
}
}
}
// scanProcs lists /proc and returns every directory entry whose name parses
// as an integer, i.e. the PIDs visible at the time of the call. Entries that
// are not directories or not numeric are skipped. An error is returned only
// when /proc itself cannot be read.
func scanProcs() ([]int, error) {
	dirents, err := os.ReadDir("/proc")
	if err != nil {
		return nil, err
	}

	var found []int
	for _, ent := range dirents {
		if ent.IsDir() {
			if n, convErr := strconv.Atoi(ent.Name()); convErr == nil {
				found = append(found, n)
			}
		}
	}
	return found, nil
}
// readCmdline returns the contents of /proc/<pid>/cmdline with every NUL
// separator replaced by a space, so callers can run plain substring and
// regexp matches against it. The read error is returned unchanged when the
// file cannot be read.
func readCmdline(pid int) (string, error) {
	procFile := strings.Join([]string{"/proc", strconv.Itoa(pid), "cmdline"}, "/")
	raw, err := os.ReadFile(procFile)
	if err != nil {
		return "", err
	}
	// /proc/<pid>/cmdline is NUL-separated; re-join the pieces with spaces.
	args := strings.Split(string(raw), "\x00")
	return strings.Join(args, " "), nil
}

View File

@ -2,13 +2,25 @@ package vm
import "fmt"
// VMConfig holds the configuration for creating a Firecracker microVM.
// SandboxTmpDir returns the per-sandbox tmpfs mount point used inside the
// VMM's private mount namespace: /tmp/ch-vm-<sandboxID>. Recorded as the
// disk path in CH's saved config.json, so restore paths must reconstruct it
// exactly to make the symlink prelude resolve.
func SandboxTmpDir(sandboxID string) string {
	const layout = "/tmp/ch-vm-%s"
	dir := fmt.Sprintf(layout, sandboxID)
	return dir
}
// SandboxSocketPath returns the Cloud Hypervisor API socket path for a
// sandbox: /tmp/ch-<sandboxID>.sock.
func SandboxSocketPath(sandboxID string) string {
	const layout = "/tmp/ch-%s.sock"
	sock := fmt.Sprintf(layout, sandboxID)
	return sock
}
// VMConfig holds the configuration for creating a Cloud Hypervisor microVM.
type VMConfig struct {
// SandboxID is the unique identifier for this sandbox (e.g., "cl-a1b2c3d4").
SandboxID string
// TemplateID is the template UUID string used to populate MMDS metadata
// so that envd can read WRENN_TEMPLATE_ID from inside the guest.
// TemplateID is the template UUID string, passed to envd via PostInit.
TemplateID string
// KernelPath is the path to the uncompressed Linux kernel (vmlinux).
@ -25,12 +37,12 @@ type VMConfig struct {
MemoryMB int
// NetworkNamespace is the name of the network namespace to launch
// Firecracker inside (e.g., "ns-1"). The namespace must already exist
// Cloud Hypervisor inside (e.g., "ns-1"). The namespace must already exist
// with a TAP device configured.
NetworkNamespace string
// TapDevice is the name of the TAP device inside the network namespace
// that Firecracker will attach to (e.g., "tap0").
// that Cloud Hypervisor will attach to (e.g., "tap0").
TapDevice string
// TapMAC is the MAC address for the TAP device.
@ -45,19 +57,34 @@ type VMConfig struct {
// NetMask is the subnet mask for the guest network (e.g., "255.255.255.252").
NetMask string
// FirecrackerBin is the path to the firecracker binary.
FirecrackerBin string
// VMMBin is the path to the cloud-hypervisor binary.
VMMBin string
// SocketPath is the path for the Firecracker API Unix socket.
// SocketPath is the path for the Cloud Hypervisor API Unix socket.
SocketPath string
// SandboxDir is the tmpfs mount point for per-sandbox files inside the
// mount namespace (e.g., "/fc-vm").
// mount namespace (e.g., "/ch-vm").
SandboxDir string
// InitPath is the path to the init process inside the guest.
// Defaults to "/sbin/init" if empty.
InitPath string
// RestoreFromDir, if non-empty, switches the process launcher into restore
// mode. CH is invoked with `--restore source_url=file://{dir}/` instead of
// the fresh-boot path. The directory must contain CH's snapshot artefacts
// (config.json, state.json, memory-ranges, memory file).
RestoreFromDir string
// RestoreLazyMemory enables `memory_restore_mode=ondemand` so guest pages
// fault in lazily via userfaultfd. Only honored when RestoreFromDir is set.
RestoreLazyMemory bool
// LogDir is the directory for Cloud Hypervisor log files. If set, CH
// stdout/stderr are written to {LogDir}/ch-{SandboxID}.log instead of
// the parent process's stdout/stderr.
LogDir string
}
func (c *VMConfig) applyDefaults() {
@ -67,14 +94,14 @@ func (c *VMConfig) applyDefaults() {
if c.MemoryMB == 0 {
c.MemoryMB = 512
}
if c.FirecrackerBin == "" {
c.FirecrackerBin = "/usr/local/bin/firecracker"
if c.VMMBin == "" {
c.VMMBin = "/usr/local/bin/cloud-hypervisor"
}
if c.SocketPath == "" {
c.SocketPath = fmt.Sprintf("/tmp/fc-%s.sock", c.SandboxID)
c.SocketPath = SandboxSocketPath(c.SandboxID)
}
if c.SandboxDir == "" {
c.SandboxDir = "/tmp/fc-vm"
c.SandboxDir = SandboxTmpDir(c.SandboxID)
}
if c.TapDevice == "" {
c.TapDevice = "tap0"
@ -95,7 +122,7 @@ func (c *VMConfig) kernelArgs() string {
)
return fmt.Sprintf(
"console=ttyS0 reboot=k panic=1 pci=off quiet loglevel=1 clocksource=kvm-clock init=%s %s",
"console=ttyS0 root=/dev/vda rw rootflags=nodiscard reboot=k panic=1 quiet loglevel=1 init_on_free=1 clocksource=kvm-clock init=%s %s",
c.InitPath, ipArg,
)
}

View File

@ -1,202 +0,0 @@
package vm
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
)
// fcClient talks to the Firecracker HTTP API over a Unix socket.
//
// Requests are issued through the http field; newFCClient configures its
// transport to dial socketPath rather than a TCP address.
type fcClient struct {
// http is the client used for every API request.
http *http.Client
// socketPath is the filesystem path of the Firecracker API Unix socket.
socketPath string
}
// newFCClient builds an fcClient whose HTTP transport always dials the Unix
// socket at socketPath, ignoring the network and address derived from the
// request URL.
func newFCClient(socketPath string) *fcClient {
	dialUnix := func(ctx context.Context, _, _ string) (net.Conn, error) {
		var dialer net.Dialer
		return dialer.DialContext(ctx, "unix", socketPath)
	}

	// No global timeout — callers pass context.Context with appropriate
	// deadlines. A fixed 10s timeout was too short for snapshot/resume
	// operations on large-memory VMs (20GB+ memfiles).
	httpClient := &http.Client{
		Transport: &http.Transport{DialContext: dialUnix},
	}

	client := &fcClient{socketPath: socketPath}
	client.http = httpClient
	return client
}
// do sends one request to the Firecracker API.
//
// body, when non-nil, is marshalled to JSON and sent with a
// "Content-Type: application/json" header. The response body is never
// returned to the caller: any status code >= 300 becomes an error carrying
// the response body (truncated to maxBody bytes); anything else is success.
func (c *fcClient) do(ctx context.Context, method, path string, body any) error {
	// maxBody bounds how much of a response is buffered into an error message
	// or drained on close, so a misbehaving peer cannot make us read forever.
	const maxBody = 1 << 20

	var bodyReader io.Reader
	if body != nil {
		data, err := json.Marshal(body)
		if err != nil {
			return fmt.Errorf("marshal request body: %w", err)
		}
		bodyReader = bytes.NewReader(data)
	}

	// The host in the URL is ignored for Unix sockets; we use "localhost" by convention.
	req, err := http.NewRequestWithContext(ctx, method, "http://localhost"+path, bodyReader)
	if err != nil {
		return fmt.Errorf("create request: %w", err)
	}
	if body != nil {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := c.http.Do(req)
	if err != nil {
		return fmt.Errorf("%s %s: %w", method, path, err)
	}
	defer func() {
		// Read the remainder before closing: the transport can only reuse a
		// keep-alive connection when the body was consumed to EOF. Both
		// calls are best-effort; the request outcome is already decided.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxBody))
		_ = resp.Body.Close()
	}()

	if resp.StatusCode >= 300 {
		// Best-effort: a failed read still yields a useful status-only error.
		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxBody))
		return fmt.Errorf("%s %s: status %d: %s", method, path, resp.StatusCode, string(respBody))
	}
	return nil
}
// setBootSource tells Firecracker which kernel image to boot and which boot
// arguments to pass to it.
func (c *fcClient) setBootSource(ctx context.Context, kernelPath, bootArgs string) error {
	bootSource := map[string]string{
		"boot_args":         bootArgs,
		"kernel_image_path": kernelPath,
	}
	return c.do(ctx, http.MethodPut, "/boot-source", bootSource)
}
// setRootfsDrive registers the drive driveID, backed by path on the host, as
// the root device. readOnly controls whether it is attached read-only.
func (c *fcClient) setRootfsDrive(ctx context.Context, driveID, path string, readOnly bool) error {
	endpoint := "/drives/" + driveID
	drive := map[string]any{
		"drive_id":       driveID,
		"is_read_only":   readOnly,
		"is_root_device": true,
		"path_on_host":   path,
	}
	return c.do(ctx, http.MethodPut, endpoint, drive)
}
// setNetworkInterface attaches interface ifaceID to the TAP device tapName
// with guest MAC macAddr.
//
// The request also installs a tx_rate_limiter that caps sustained guest→host
// throughput, so user application traffic cannot saturate the TAP device and
// starve envd control traffic (PTY, exec, file ops).
func (c *fcClient) setNetworkInterface(ctx context.Context, ifaceID, tapName, macAddr string) error {
	const (
		sustainedBytes = 200 * 1024 * 1024 // 200 MB/s sustained
		refillMillis   = 1000              // refill period: 1 second
		burstBytes     = 100 * 1024 * 1024 // 100 MB initial burst
	)

	bandwidth := map[string]any{
		"size":           sustainedBytes,
		"refill_time":    refillMillis,
		"one_time_burst": burstBytes,
	}
	iface := map[string]any{
		"iface_id":        ifaceID,
		"host_dev_name":   tapName,
		"guest_mac":       macAddr,
		"tx_rate_limiter": map[string]any{"bandwidth": bandwidth},
	}
	return c.do(ctx, http.MethodPut, "/network-interfaces/"+ifaceID, iface)
}
// setMachineConfig sets the vCPU count and memory size (in MiB) of the
// microVM. SMT is always disabled.
func (c *fcClient) setMachineConfig(ctx context.Context, vcpus, memMB int) error {
	machine := map[string]any{
		"mem_size_mib": memMB,
		"smt":          false,
		"vcpu_count":   vcpus,
	}
	return c.do(ctx, http.MethodPut, "/machine-config", machine)
}
// setMMDSConfig enables MMDS V2 (token-based access) on the network
// interface ifaceID. Must be called before startVM.
func (c *fcClient) setMMDSConfig(ctx context.Context, ifaceID string) error {
	interfaces := []string{ifaceID}
	mmdsConfig := map[string]any{
		"network_interfaces": interfaces,
		"version":            "V2",
	}
	return c.do(ctx, http.MethodPut, "/mmds/config", mmdsConfig)
}
// mmdsMetadata is the metadata payload written to the Firecracker MMDS store.
// envd reads this via PollForMMDSOpts to populate WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID.
type mmdsMetadata struct {
// SandboxID is serialized under the JSON key "instanceID".
SandboxID string `json:"instanceID"`
// TemplateID is serialized under the JSON key "envID".
TemplateID string `json:"envID"`
}
// setMMDS stores the sandbox and template IDs in the Firecracker MMDS store.
// Can be called after the VM has started.
func (c *fcClient) setMMDS(ctx context.Context, sandboxID, templateID string) error {
	var meta mmdsMetadata
	meta.SandboxID = sandboxID
	meta.TemplateID = templateID
	return c.do(ctx, http.MethodPut, "/mmds", meta)
}
// setBalloon configures the Firecracker balloon device used for dynamic
// memory management. amountMiB is the initial balloon size, deflateOnOom
// lets the guest reclaim balloon pages under memory pressure, and
// statsIntervalS enables periodic stats via GET /balloon/statistics.
// Must be called before startVM.
func (c *fcClient) setBalloon(ctx context.Context, amountMiB int, deflateOnOom bool, statsIntervalS int) error {
	balloon := map[string]any{
		"amount_mib":               amountMiB,
		"stats_polling_interval_s": statsIntervalS,
		"deflate_on_oom":           deflateOnOom,
	}
	return c.do(ctx, http.MethodPut, "/balloon", balloon)
}
// updateBalloon changes the balloon target (in MiB) of a running VM.
func (c *fcClient) updateBalloon(ctx context.Context, amountMiB int) error {
	patch := map[string]any{"amount_mib": amountMiB}
	return c.do(ctx, http.MethodPatch, "/balloon", patch)
}
// startVM boots the configured microVM by issuing the InstanceStart action.
func (c *fcClient) startVM(ctx context.Context) error {
	action := map[string]string{"action_type": "InstanceStart"}
	return c.do(ctx, http.MethodPut, "/actions", action)
}
// pauseVM moves the microVM into the "Paused" state.
func (c *fcClient) pauseVM(ctx context.Context) error {
	patch := map[string]string{"state": "Paused"}
	return c.do(ctx, http.MethodPatch, "/vm", patch)
}
// resumeVM moves a paused microVM back into the "Resumed" state.
func (c *fcClient) resumeVM(ctx context.Context) error {
	patch := map[string]string{"state": "Resumed"}
	return c.do(ctx, http.MethodPatch, "/vm", patch)
}
// createSnapshot writes a VM snapshot: VM state goes to snapPath and guest
// memory to memPath. snapshotType is "Full" (all memory) or "Diff" (only
// dirty pages since last resume).
func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath, snapshotType string) error {
	req := map[string]any{
		"mem_file_path": memPath,
		"snapshot_path": snapPath,
		"snapshot_type": snapshotType,
	}
	return c.do(ctx, http.MethodPut, "/snapshot/create", req)
}
// loadSnapshotWithUffd loads the snapshot at snapPath with a "Uffd" memory
// backend so guest memory is loaded lazily. Firecracker will connect to
// uffdSocketPath and send the uffd fd + memory region mappings. The request
// sets resume_vm to false, so the VM is not resumed by the load itself.
func (c *fcClient) loadSnapshotWithUffd(ctx context.Context, snapPath, uffdSocketPath string) error {
	memBackend := map[string]any{
		"backend_type": "Uffd",
		"backend_path": uffdSocketPath,
	}
	req := map[string]any{
		"snapshot_path": snapPath,
		"resume_vm":     false,
		"mem_backend":   memBackend,
	}
	return c.do(ctx, http.MethodPut, "/snapshot/load", req)
}

View File

@ -1,128 +0,0 @@
package vm
import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"syscall"
"time"
)
// process represents a running Firecracker process with mount and network
// namespace isolation.
type process struct {
// cmd is the command launched by startProcess (the unshare wrapper).
cmd *exec.Cmd
// cancel cancels the context the command was started with.
cancel context.CancelFunc
// exitCh is closed once cmd.Wait has returned.
exitCh chan struct{}
// exitErr holds the result of cmd.Wait; it is written before exitCh is
// closed.
exitErr error
}
// startProcess launches the Firecracker binary inside an isolated mount
// namespace and the specified network namespace. It runs the script from
// buildStartScript under `unshare -m -- bash -c`, which performs:
//
//  1. unshare -m: creates a private mount namespace
//  2. mount --make-rprivate /: prevents mount propagation to host
//  3. mount tmpfs at SandboxDir: ephemeral workspace for this VM
//  4. symlink kernel and rootfs into SandboxDir
//  5. ip netns exec <ns>: enters the network namespace where TAP is configured
//  6. exec firecracker with the API socket path
//
// The returned process has a background goroutine waiting on the command;
// its exit channel is closed when the command finishes.
func startProcess(ctx context.Context, cfg *VMConfig) (*process, error) {
	// The VM must outlive the HTTP request that created it, so the command is
	// bound to a fresh background context rather than to ctx.
	runCtx, stopCtx := context.WithCancel(context.Background())

	launcher := exec.CommandContext(runCtx, "unshare", "-m", "--", "bash", "-c", buildStartScript(cfg))
	// New session so signals don't propagate from the parent.
	launcher.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
	launcher.Stdout, launcher.Stderr = os.Stdout, os.Stderr

	if err := launcher.Start(); err != nil {
		stopCtx()
		return nil, fmt.Errorf("start firecracker process: %w", err)
	}

	proc := &process{cmd: launcher, cancel: stopCtx, exitCh: make(chan struct{})}
	go func() {
		proc.exitErr = launcher.Wait()
		close(proc.exitCh)
	}()

	slog.Info("firecracker process started",
		"pid", launcher.Process.Pid,
		"sandbox", cfg.SandboxID,
	)
	return proc, nil
}
// buildStartScript generates the bash script that sets up the mount namespace,
// symlinks kernel/rootfs, and execs Firecracker inside the network namespace.
//
// The script runs with `set -euo pipefail`, so the first failing command
// aborts the launch. Indexed format verbs (%[1]s … %[6]s) map to the
// arguments in the order listed after the template.
//
// NOTE(review): the config values are interpolated into the script without
// shell quoting, so a path containing spaces or shell metacharacters would
// change the script's meaning — confirm all callers pass trusted values.
func buildStartScript(cfg *VMConfig) string {
return fmt.Sprintf(`
set -euo pipefail
# Prevent mount propagation to the host
mount --make-rprivate /
# Create ephemeral tmpfs workspace
mkdir -p %[1]s
mount -t tmpfs tmpfs %[1]s
# Symlink kernel and rootfs into the workspace
ln -s %[2]s %[1]s/vmlinux
ln -s %[3]s %[1]s/rootfs.ext4
# Launch Firecracker inside the network namespace
exec ip netns exec %[4]s %[5]s --api-sock %[6]s
`,
cfg.SandboxDir, // 1
cfg.KernelPath, // 2
cfg.RootfsPath, // 3
cfg.NetworkNamespace, // 4
cfg.FirecrackerBin, // 5
cfg.SocketPath, // 6
)
}
// stop sends SIGTERM to the process group and waits for the process to exit.
// If it has not exited within 10 seconds, SIGKILL is sent and stop blocks
// until the exit is observed. Signal delivery failures are only logged; the
// return value is always nil.
func (p *process) stop() error {
	proc := p.cmd.Process
	if proc == nil {
		return nil
	}

	// A negative PID addresses the whole process group.
	pgid := -proc.Pid
	if err := syscall.Kill(pgid, syscall.SIGTERM); err != nil {
		slog.Debug("sigterm failed, process may have exited", "error", err)
	}

	grace := time.NewTimer(10 * time.Second)
	defer grace.Stop()

	select {
	case <-p.exitCh:
	case <-grace.C:
		slog.Warn("firecracker did not exit after SIGTERM, sending SIGKILL")
		if err := syscall.Kill(pgid, syscall.SIGKILL); err != nil {
			slog.Debug("sigkill failed", "error", err)
		}
		<-p.exitCh
	}
	return nil
}
// exited exposes a receive-only view of the channel that is closed when the
// process exits.
func (p *process) exited() <-chan struct{} {
	var done <-chan struct{} = p.exitCh
	return done
}

View File

@ -5,18 +5,19 @@ import (
"fmt"
"log/slog"
"os"
"strings"
"sync"
"time"
)
// VM represents a running Firecracker microVM.
// VM represents a running Cloud Hypervisor microVM.
type VM struct {
Config VMConfig
process *process
client *fcClient
client *chClient
}
// Manager handles the lifecycle of Firecracker microVMs.
// Manager handles the lifecycle of Cloud Hypervisor microVMs.
type Manager struct {
mu sync.RWMutex
// vms tracks running VMs by sandbox ID.
@ -30,7 +31,7 @@ func NewManager() *Manager {
}
}
// Create boots a new Firecracker microVM with the given configuration.
// Create boots a new Cloud Hypervisor microVM with the given configuration.
// The network namespace and TAP device must already be set up.
func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
cfg.applyDefaults()
@ -38,7 +39,6 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return nil, fmt.Errorf("invalid config: %w", err)
}
// Clean up any leftover socket from a previous run.
os.Remove(cfg.SocketPath)
slog.Info("creating VM",
@ -47,8 +47,8 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
"memory_mb", cfg.MemoryMB,
)
// Step 1: Launch the Firecracker process.
proc, err := startProcess(ctx, &cfg)
// Step 1: Launch the Cloud Hypervisor process.
proc, err := startProcess(&cfg)
if err != nil {
return nil, fmt.Errorf("start process: %w", err)
}
@ -59,25 +59,18 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return nil, fmt.Errorf("wait for socket: %w", err)
}
// Step 3: Configure the VM via the Firecracker API.
client := newFCClient(cfg.SocketPath)
// Step 3: Configure and boot the VM via a single API call.
client := newCHClient(cfg.SocketPath)
if err := configureVM(ctx, client, &cfg); err != nil {
if err := client.createVM(ctx, &cfg); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("configure VM: %w", err)
return nil, fmt.Errorf("create VM config: %w", err)
}
// Step 4: Start the VM.
if err := client.startVM(ctx); err != nil {
// Step 4: Boot the VM.
if err := client.bootVM(ctx); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("start VM: %w", err)
}
// Step 5: Push sandbox metadata into MMDS so envd can read
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("set MMDS metadata: %w", err)
return nil, fmt.Errorf("boot VM: %w", err)
}
vm := &VM{
@ -95,78 +88,34 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
return vm, nil
}
// configureVM sends the configuration to Firecracker via its HTTP API.
func configureVM(ctx context.Context, client *fcClient, cfg *VMConfig) error {
// Boot source (kernel + args)
if err := client.setBootSource(ctx, cfg.KernelPath, cfg.kernelArgs()); err != nil {
return fmt.Errorf("set boot source: %w", err)
}
// Root drive — use the symlink path inside the mount namespace so that
// snapshots record a stable path that works on restore.
rootfsSymlink := cfg.SandboxDir + "/rootfs.ext4"
if err := client.setRootfsDrive(ctx, "rootfs", rootfsSymlink, false); err != nil {
return fmt.Errorf("set rootfs drive: %w", err)
}
// Network interface
if err := client.setNetworkInterface(ctx, "eth0", cfg.TapDevice, cfg.TapMAC); err != nil {
return fmt.Errorf("set network interface: %w", err)
}
// Machine config (vCPUs + memory)
if err := client.setMachineConfig(ctx, cfg.VCPUs, cfg.MemoryMB); err != nil {
return fmt.Errorf("set machine config: %w", err)
}
// Balloon device — allows the host to reclaim unused guest memory.
// Start with 0 (no inflation). deflate_on_oom lets the guest reclaim
// balloon pages under memory pressure. Stats interval enables monitoring.
if err := client.setBalloon(ctx, 0, true, 5); err != nil {
slog.Warn("set balloon failed (non-fatal, VM will run without memory reclaim)", "error", err)
}
// MMDS config — enable V2 token access on eth0 so that envd can read
// WRENN_SANDBOX_ID and WRENN_TEMPLATE_ID from inside the guest.
if err := client.setMMDSConfig(ctx, "eth0"); err != nil {
return fmt.Errorf("set MMDS config: %w", err)
}
return nil
}
// Pause pauses a running VM.
// Pause freezes a running VM's vCPUs via the CH API.
func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
m.mu.RLock()
vm, ok := m.vms[sandboxID]
m.mu.RUnlock()
vm, ok := m.Get(sandboxID)
if !ok {
return fmt.Errorf("VM not found: %s", sandboxID)
}
if err := vm.client.pauseVM(ctx); err != nil {
return fmt.Errorf("pause VM: %w", err)
}
slog.Info("VM paused", "sandbox", sandboxID)
return nil
return vm.client.pauseVM(ctx)
}
// Resume resumes a paused VM.
// Resume unfreezes a paused VM via the CH API.
func (m *Manager) Resume(ctx context.Context, sandboxID string) error {
m.mu.RLock()
vm, ok := m.vms[sandboxID]
m.mu.RUnlock()
vm, ok := m.Get(sandboxID)
if !ok {
return fmt.Errorf("VM not found: %s", sandboxID)
}
return vm.client.resumeVM(ctx)
}
if err := vm.client.resumeVM(ctx); err != nil {
return fmt.Errorf("resume VM: %w", err)
// Info returns the CH VM state (e.g. "Running", "Paused", "Shutdown") via
// the CH unix-socket API. Returns an error if the socket is dead or the VM
// is not registered. Use to probe liveness before issuing destructive ops
// like pause or snapshot.
func (m *Manager) Info(ctx context.Context, sandboxID string) (string, error) {
vm, ok := m.Get(sandboxID)
if !ok {
return "", fmt.Errorf("VM not found: %s", sandboxID)
}
slog.Info("VM resumed", "sandbox", sandboxID)
return nil
return vm.client.vmInfo(ctx)
}
// UpdateBalloon adjusts the balloon target for a running VM.
@ -179,7 +128,8 @@ func (m *Manager) UpdateBalloon(ctx context.Context, sandboxID string, amountMiB
return fmt.Errorf("VM not found: %s", sandboxID)
}
return vm.client.updateBalloon(ctx, amountMiB)
sizeBytes := int64(amountMiB) * 1024 * 1024
return vm.client.resizeBalloon(ctx, sizeBytes)
}
// Destroy stops and cleans up a VM.
@ -190,103 +140,98 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
m.mu.Unlock()
return fmt.Errorf("VM not found: %s", sandboxID)
}
delete(m.vms, sandboxID)
m.mu.Unlock()
slog.Info("destroying VM", "sandbox", sandboxID)
// Stop the Firecracker process.
// Try clean shutdown first, fall back to process kill.
shutdownCtx, shutdownCancel := context.WithTimeout(ctx, 5*time.Second)
if err := vm.client.shutdownVMM(shutdownCtx); err != nil {
slog.Debug("clean VMM shutdown failed, killing process", "sandbox", sandboxID, "error", err)
}
shutdownCancel()
if err := vm.process.stop(); err != nil {
slog.Warn("error stopping process", "sandbox", sandboxID, "error", err)
}
// Clean up the API socket.
os.Remove(vm.Config.SocketPath)
m.mu.Lock()
delete(m.vms, sandboxID)
m.mu.Unlock()
slog.Info("VM destroyed", "sandbox", sandboxID)
return nil
}
// Snapshot creates a VM snapshot. The VM must already be paused.
// snapshotType is "Full" (all memory) or "Diff" (only dirty pages since last resume).
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapPath, memPath, snapshotType string) error {
m.mu.RLock()
vm, ok := m.vms[sandboxID]
m.mu.RUnlock()
// Snapshot writes the VM's config/state/memory to snapshotDir via CH's
// vm.snapshot API. The VM must already be paused. snapshotDir must be an
// absolute path; it is passed to CH as `file://{dir}/`.
func (m *Manager) Snapshot(ctx context.Context, sandboxID, snapshotDir string) error {
vm, ok := m.Get(sandboxID)
if !ok {
return fmt.Errorf("VM not found: %s", sandboxID)
}
if err := vm.client.createSnapshot(ctx, snapPath, memPath, snapshotType); err != nil {
return fmt.Errorf("create snapshot: %w", err)
if err := os.MkdirAll(snapshotDir, 0o755); err != nil {
return fmt.Errorf("mkdir snapshot dir: %w", err)
}
slog.Info("VM snapshot created", "sandbox", sandboxID, "snap_path", snapPath, "type", snapshotType)
url := "file://" + strings.TrimRight(snapshotDir, "/") + "/"
if err := vm.client.snapshotVM(ctx, url); err != nil {
return fmt.Errorf("vm.snapshot: %w", err)
}
slog.Info("VM snapshot written", "sandbox", sandboxID, "dir", snapshotDir)
return nil
}
// CreateFromSnapshot boots a new Firecracker VM by loading a snapshot
// using UFFD for lazy memory loading. The network namespace and TAP
// device must already be set up.
// CreateFromSnapshot launches a Cloud Hypervisor process in restore mode,
// connecting it to an existing snapshot directory. The VM is left in the
// paused state — the caller is expected to call Resume after any post-restore
// setup (e.g. re-acquiring envd connectivity is implicit via TCP).
//
// No boot resources (kernel, drives, machine config) are configured —
// the snapshot carries all that state. The rootfs path recorded in the
// snapshot is resolved via a stable symlink at SandboxDir/rootfs.ext4
// inside the mount namespace (created by the start script in jailer.go).
//
// The sequence is:
// 1. Start FC process in mount+network namespace (creates tmpfs + rootfs symlink)
// 2. Wait for API socket
// 3. Load snapshot with UFFD backend
// 4. Resume VM execution
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath, uffdSocketPath string) (*VM, error) {
// cfg.RestoreFromDir must point to an absolute path containing the CH
// snapshot artefacts. The disk path inside config.json must already resolve
// (CH receives the same SandboxDir/rootfs.ext4 symlink as for fresh boot).
func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig) (*VM, error) {
cfg.applyDefaults()
if err := cfg.validate(); err != nil {
return nil, fmt.Errorf("invalid config: %w", err)
}
if cfg.RestoreFromDir == "" {
return nil, fmt.Errorf("RestoreFromDir is required for restore")
}
os.Remove(cfg.SocketPath)
slog.Info("restoring VM from snapshot",
"sandbox", cfg.SandboxID,
"snap_path", snapPath,
"restore_dir", cfg.RestoreFromDir,
"lazy_memory", cfg.RestoreLazyMemory,
)
// Step 1: Launch the Firecracker process.
// The start script creates a tmpfs at SandboxDir and symlinks
// rootfs.ext4 → cfg.RootfsPath, so the snapshot's recorded rootfs
// path (/fc-vm/rootfs.ext4) resolves to the new clone.
proc, err := startProcess(ctx, &cfg)
proc, err := startRestoreProcess(&cfg)
if err != nil {
return nil, fmt.Errorf("start process: %w", err)
return nil, fmt.Errorf("start restore process: %w", err)
}
// Step 2: Wait for the API socket.
if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("wait for socket: %w", err)
}
client := newFCClient(cfg.SocketPath)
client := newCHClient(cfg.SocketPath)
// Step 3: Load the snapshot with UFFD backend.
// No boot resources are configured — the snapshot carries kernel,
// drive, network, and machine config state.
if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
// Confirm CH actually hydrated the snapshot before registering. Without
// this check, a broken snapshot would leave a zombie *VM in the map that
// blocks future restores for the same sandbox ID.
state, err := client.vmInfo(ctx)
if err != nil {
_ = proc.stop()
return nil, fmt.Errorf("load snapshot: %w", err)
return nil, fmt.Errorf("vm.info after restore: %w", err)
}
// Step 4: Resume the VM.
if err := client.resumeVM(ctx); err != nil {
if state != "Paused" {
_ = proc.stop()
return nil, fmt.Errorf("resume VM: %w", err)
}
// Step 5: Push sandbox metadata into MMDS.
if err := client.setMMDS(ctx, cfg.SandboxID, cfg.TemplateID); err != nil {
_ = proc.stop()
return nil, fmt.Errorf("set MMDS metadata: %w", err)
return nil, fmt.Errorf("unexpected post-restore VM state %q (want Paused)", state)
}
vm := &VM{
@ -299,16 +244,20 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
m.vms[cfg.SandboxID] = vm
m.mu.Unlock()
slog.Info("VM restored from snapshot", "sandbox", cfg.SandboxID)
slog.Info("VM restored from snapshot (paused)", "sandbox", cfg.SandboxID)
return vm, nil
}
// PID returns the OS process ID of the launcher command backing this VM
// (v.process.cmd).
//
// NOTE(review): the earlier wording called the VMM "Firecracker" and described
// it as a direct child of this PID. This package now launches Cloud
// Hypervisor; confirm the parent/child relationship before relying on it.
func (v *VM) PID() int {
	launcher := v.process.cmd.Process
	return launcher.Pid
}
// Exited exposes the underlying process's exit notification: the returned
// channel is closed once the VM process has exited.
func (v *VM) Exited() <-chan struct{} {
	done := v.process.exited()
	return done
}
// Get returns a running VM by sandbox ID.
func (m *Manager) Get(sandboxID string) (*VM, bool) {
m.mu.RLock()
@ -317,7 +266,7 @@ func (m *Manager) Get(sandboxID string) (*VM, bool) {
return vm, ok
}
// waitForSocket polls for the Firecracker API socket to appear on disk.
// waitForSocket polls for the Cloud Hypervisor API socket to appear on disk.
func waitForSocket(ctx context.Context, socketPath string, proc *process) error {
ticker := time.NewTicker(10 * time.Millisecond)
defer ticker.Stop()
@ -329,7 +278,7 @@ func waitForSocket(ctx context.Context, socketPath string, proc *process) error
case <-ctx.Done():
return ctx.Err()
case <-proc.exited():
return fmt.Errorf("firecracker process exited before socket was ready")
return fmt.Errorf("cloud-hypervisor process exited before socket was ready")
case <-timeout:
return fmt.Errorf("timed out waiting for API socket at %s", socketPath)
case <-ticker.C:

174
internal/vm/process.go Normal file
View File

@ -0,0 +1,174 @@
package vm
import (
"context"
"fmt"
"log/slog"
"os"
"os/exec"
"strings"
"syscall"
"time"
)
// process represents a running Cloud Hypervisor process with mount and network
// namespace isolation. Instances are created by launchScript.
type process struct {
// cmd is the started launcher command (`unshare -m -- bash -c <script>`).
cmd *exec.Cmd
// cancel cancels the context the command was created with in launchScript.
cancel context.CancelFunc
// exitCh is closed by the waiter goroutine after cmd.Wait has returned.
exitCh chan struct{}
// exitErr holds the result of cmd.Wait. It is written before exitCh is
// closed, so it should only be read after exitCh has been observed closed.
exitErr error
// logFile receives the command's stdout and stderr when a log directory is
// configured; nil otherwise. It is closed when the process exits.
logFile *os.File
}
// startProcess boots Cloud Hypervisor for a fresh (non-snapshot) VM. It
// renders the start script for cfg and hands it to launchScript, which runs it
// inside a private mount namespace. The resulting launch sequence is:
//
//  1. unshare -m: creates a private mount namespace
//  2. mount --make-rprivate /: prevents mount propagation to host
//  3. mount tmpfs at SandboxDir: ephemeral workspace for this VM
//  4. symlink kernel and rootfs into SandboxDir
//  5. ip netns exec <ns>: enters the network namespace where TAP is configured
//  6. exec cloud-hypervisor with the API socket path
//
// It returns the process handle, or the error reported by launchScript.
func startProcess(cfg *VMConfig) (*process, error) {
	return launchScript(buildStartScript(cfg), cfg)
}
// startRestoreProcess boots Cloud Hypervisor in restore mode. The script it
// runs shares the namespace/tmpfs/symlink prelude with startProcess, so the
// disk paths recorded in the snapshot's config.json remain valid, and then
// execs CH with `--restore`.
//
// It returns the process handle, or the error reported by launchScript.
func startRestoreProcess(cfg *VMConfig) (*process, error) {
	return launchScript(buildRestoreScript(cfg), cfg)
}
// launchScript runs script via `unshare -m -- bash -c` in a new session and
// returns a handle for the started process.
//
// When cfg.LogDir is non-empty, the command's stdout and stderr are appended
// to <LogDir>/ch-<SandboxID>.log; that file is closed once the process exits.
// The command is bound to a context derived from context.Background rather
// than a caller-supplied one; the cancel function is stored on the handle.
//
// A background goroutine waits for the command, records the wait result in
// exitErr, closes the log file, and finally closes exitCh.
//
// An error is returned if the log file cannot be opened or the command cannot
// be started; in both cases the context is cancelled and nothing is leaked.
func launchScript(script string, cfg *VMConfig) (*process, error) {
	runCtx, release := context.WithCancel(context.Background())

	command := exec.CommandContext(runCtx, "unshare", "-m", "--", "bash", "-c", script)
	// A new session makes the launcher a process-group leader, which is what
	// lets stop() signal the whole group through the negative PID.
	command.SysProcAttr = &syscall.SysProcAttr{Setsid: true}

	var sink *os.File
	if cfg.LogDir != "" {
		logPath := fmt.Sprintf("%s/ch-%s.log", cfg.LogDir, cfg.SandboxID)
		opened, err := os.OpenFile(logPath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0640)
		if err != nil {
			release()
			return nil, fmt.Errorf("open CH log file %s: %w", logPath, err)
		}
		sink = opened
		command.Stdout = opened
		command.Stderr = opened
	}

	if err := command.Start(); err != nil {
		release()
		if sink != nil {
			sink.Close()
		}
		return nil, fmt.Errorf("start cloud-hypervisor process: %w", err)
	}

	handle := &process{
		cmd:     command,
		cancel:  release,
		exitCh:  make(chan struct{}),
		logFile: sink,
	}

	// Reap the command and publish its exit. exitErr is written before exitCh
	// is closed so readers that wait on exitCh observe the final value.
	go func() {
		defer close(handle.exitCh)
		handle.exitErr = command.Wait()
		if handle.logFile != nil {
			handle.logFile.Close()
		}
	}()

	slog.Info("cloud-hypervisor process started",
		"pid", command.Process.Pid,
		"sandbox", cfg.SandboxID,
	)
	return handle, nil
}
// buildStartScript renders the bash script used for a fresh boot: the mount
// namespace prelude, the kernel/rootfs symlinks, and the final Cloud
// Hypervisor exec line with no additional command-line arguments.
func buildStartScript(cfg *VMConfig) string {
	const noExtraArgs = ""
	return buildLaunchScript(cfg, noExtraArgs)
}
// buildRestoreScript renders the bash script that restores a VM from the
// snapshot directory cfg.RestoreFromDir. The mount/symlink prelude is the same
// one used for fresh boot, so disk paths in the snapshot config.json resolve
// correctly.
//
// Trailing slashes on RestoreFromDir are trimmed and exactly one is re-added,
// yielding `--restore source_url=file://<dir>/`. When cfg.RestoreLazyMemory is
// set, `,memory_restore_mode=ondemand` is appended to that same argument.
func buildRestoreScript(cfg *VMConfig) string {
	snapshotDir := strings.TrimRight(cfg.RestoreFromDir, "/")

	// The restore flag takes comma-separated key=value options.
	options := []string{fmt.Sprintf("--restore source_url=file://%s/", snapshotDir)}
	if cfg.RestoreLazyMemory {
		options = append(options, "memory_restore_mode=ondemand")
	}

	return buildLaunchScript(cfg, strings.Join(options, ","))
}
// buildLaunchScript renders the bash script shared by fresh-boot and restore
// launches. The script:
//
//   - makes all mounts private so nothing propagates out of the namespace,
//   - mounts a tmpfs at cfg.SandboxDir,
//   - symlinks cfg.KernelPath and cfg.RootfsPath into it as vmlinux and
//     rootfs.ext4,
//   - execs cfg.VMMBin inside cfg.NetworkNamespace with the API socket at
//     cfg.SocketPath.
//
// extraArgs is appended verbatim to the exec line when non-empty — used to
// inject `--restore source_url=...` for restore launches.
//
// Note that every value is interpolated into the script without shell
// quoting, so paths and names must not contain whitespace or shell
// metacharacters.
func buildLaunchScript(cfg *VMConfig, extraArgs string) string {
	const scriptTemplate = `
set -euo pipefail
mount --make-rprivate /
mkdir -p %[1]s
mount -t tmpfs tmpfs %[1]s
ln -s %[2]s %[1]s/vmlinux
ln -s %[3]s %[1]s/rootfs.ext4
exec %[4]s
`

	vmmLine := fmt.Sprintf("ip netns exec %s %s --api-socket path=%s",
		cfg.NetworkNamespace, cfg.VMMBin, cfg.SocketPath)
	if extraArgs != "" {
		vmmLine = vmmLine + " " + extraArgs
	}

	return fmt.Sprintf(scriptTemplate,
		cfg.SandboxDir, // 1
		cfg.KernelPath, // 2
		cfg.RootfsPath, // 3
		vmmLine,        // 4
	)
}
// stop terminates the process group led by the launcher command. It sends
// SIGTERM to the group and waits for the exit notification; if none arrives
// within 10 seconds it sends SIGKILL and then waits without a timeout.
//
// Signal delivery failures are only logged at debug level, since the process
// may already be gone. stop is a no-op when the command was never started,
// and it always returns nil.
func (p *process) stop() error {
	leader := p.cmd.Process
	if leader == nil {
		return nil
	}

	// A negative PID addresses the whole process group rather than a single
	// process.
	group := -leader.Pid
	if err := syscall.Kill(group, syscall.SIGTERM); err != nil {
		slog.Debug("sigterm failed, process may have exited", "error", err)
	}

	grace := time.NewTimer(10 * time.Second)
	defer grace.Stop()

	select {
	case <-p.exitCh:
		// Exited within the grace period.
	case <-grace.C:
		slog.Warn("cloud-hypervisor did not exit after SIGTERM, sending SIGKILL")
		if err := syscall.Kill(group, syscall.SIGKILL); err != nil {
			slog.Debug("sigkill failed", "error", err)
		}
		<-p.exitCh
	}
	return nil
}
// exited exposes the process's exit notification as a receive-only channel;
// the channel is closed once the process has exited.
func (p *process) exited() <-chan struct{} {
	var done <-chan struct{} = p.exitCh
	return done
}