forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
119 lines
4.7 KiB
Go
119 lines
4.7 KiB
Go
// Package sandbox: shared CH-restore helpers used by both Resume (paused →
// running) and the snapshot-template launch path (template → fresh sandbox).
//
// The two callers diverge in how they acquire resources (slot, dm-snapshot,
// sandbox identity) but converge on:
//
//	build VMConfig → CreateFromSnapshot → vm.Resume → wait envd → balloon deflate
//
// These steps are extracted here so the sequence — and its quirks (paused
// post-restore state, balloon best-effort, restored disk path baked into
// CH's config.json) — has a single source of truth.
package sandbox
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"path/filepath"
|
|
|
|
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
|
|
"git.omukk.dev/wrenn/wrenn/internal/network"
|
|
"git.omukk.dev/wrenn/wrenn/internal/vm"
|
|
)
|
|
|
|
// restoreInputs is the common set of fields needed to build a restore VMConfig.
|
|
type restoreInputs struct {
|
|
sandboxID string // VM identity for the new CH process (sock path, log file)
|
|
templateID string // forwarded to envd via PostInit (informational)
|
|
snapDir string // directory containing CH snapshot artefacts
|
|
rootfsPath string // /dev/mapper/wrenn-{newID} — per-sandbox dm-snapshot
|
|
vcpus int
|
|
memoryMB int
|
|
slot *network.Slot
|
|
sandboxDir string // override for VMConfig.SandboxDir; "" = default
|
|
}
|
|
|
|
// buildRestoreVMConfig assembles the VMConfig used to launch a CH process in
|
|
// restore mode. sandboxDir, when non-empty, overrides the default
|
|
// "/tmp/ch-vm-{SandboxID}" — required when the snapshot's saved config.json
|
|
// points at a different sandbox's tmpfs path (i.e. snapshot-template launch).
|
|
func (m *Manager) buildRestoreVMConfig(in restoreInputs) vm.VMConfig {
|
|
return vm.VMConfig{
|
|
SandboxID: in.sandboxID,
|
|
TemplateID: in.templateID,
|
|
KernelPath: m.cfg.KernelPath,
|
|
RootfsPath: in.rootfsPath,
|
|
VCPUs: in.vcpus,
|
|
MemoryMB: in.memoryMB,
|
|
NetworkNamespace: in.slot.NamespaceID,
|
|
TapDevice: in.slot.TapName,
|
|
TapMAC: in.slot.TapMAC,
|
|
GuestIP: in.slot.GuestIP,
|
|
GatewayIP: in.slot.TapIP,
|
|
NetMask: in.slot.GuestNetMask,
|
|
VMMBin: m.cfg.VMMBin,
|
|
LogDir: filepath.Join(m.cfg.WrennDir, "logs"),
|
|
RestoreFromDir: in.snapDir,
|
|
RestoreLazyMemory: true,
|
|
SandboxDir: in.sandboxDir,
|
|
}
|
|
}
|
|
|
|
// launchRestoredVM starts CH in restore mode, resumes the vCPUs, waits for
|
|
// envd to be reachable, then best-effort deflates the balloon. On any failure
|
|
// the partial VM is destroyed before returning — the caller is responsible
|
|
// for tearing down dm/network/slot.
|
|
//
|
|
// Returns the connected envd client on success.
|
|
func (m *Manager) launchRestoredVM(ctx context.Context, vmCfg vm.VMConfig, hostIP string) (*envdclient.Client, error) {
|
|
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg); err != nil {
|
|
return nil, fmt.Errorf("create from snapshot: %w", err)
|
|
}
|
|
|
|
if err := m.vm.Resume(ctx, vmCfg.SandboxID); err != nil {
|
|
_ = m.vm.Destroy(context.Background(), vmCfg.SandboxID)
|
|
return nil, fmt.Errorf("vm resume: %w", err)
|
|
}
|
|
|
|
client := envdclient.New(hostIP)
|
|
waitCtx, waitCancel := context.WithTimeout(ctx, envdReadyTimeout(vmCfg.MemoryMB))
|
|
defer waitCancel()
|
|
if err := client.WaitUntilReady(waitCtx); err != nil {
|
|
_ = m.vm.Destroy(context.Background(), vmCfg.SandboxID)
|
|
return nil, fmt.Errorf("wait envd: %w", err)
|
|
}
|
|
|
|
// Best-effort balloon deflate. Free-page reporting drains pages while the
|
|
// sandbox runs; the resumed guest needs its full memory budget back. A
|
|
// failure leaves the guest memory-starved but doesn't break correctness.
|
|
if err := m.vm.UpdateBalloon(ctx, vmCfg.SandboxID, 0); err != nil {
|
|
slog.Warn("balloon deflate after restore failed", "id", vmCfg.SandboxID, "error", err)
|
|
}
|
|
|
|
return client, nil
|
|
}
|
|
|
|
// initAndStartMemoryLoader runs envd's /init lifecycle bump and then kicks
|
|
// off the background memory loader. Ordering matters: /init resets envd's
|
|
// mem_preload_* atomics, so the loader's POST /memory/preload must land
|
|
// after — otherwise the next CreateSnapshot/Pause would observe a stale
|
|
// "idle" state and snapshot a memfile full of holes.
|
|
//
|
|
// Must be called with sb already registered in m.boxes with StatusRunning
|
|
// and sb.client populated.
|
|
func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState, defaultUser, templateIDStr string, envVars map[string]string) {
|
|
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
|
|
defer initCancel()
|
|
c := sb.client.Load()
|
|
if c == nil {
|
|
slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
|
|
return
|
|
}
|
|
if err := c.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr); err != nil {
|
|
slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", err)
|
|
}
|
|
|
|
m.startMemoryLoader(sb)
|
|
}
|