1
0
forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev>

Reviewed-on: wrenn/wrenn#50
This commit is contained in:
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions

View File

@ -0,0 +1,28 @@
package sandbox
import (
"fmt"
"os/exec"
"strings"
)
// DetectCHVersion executes "<binaryPath> --version" and extracts the dotted
// version number from what the binary prints on stdout
// (e.g. "cloud-hypervisor v43.0" → "43.0").
//
// The output is split on whitespace; the first token that still contains a
// "." after an optional leading "v" is stripped is returned. An error is
// returned when the binary cannot be run or when no token looks like a version.
func DetectCHVersion(binaryPath string) (string, error) {
	raw, err := exec.Command(binaryPath, "--version").Output()
	if err != nil {
		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
	}

	banner := strings.TrimSpace(string(raw))
	for _, token := range strings.Fields(banner) {
		// A version token is anything with a dot once the "v" prefix is gone.
		if candidate := strings.TrimPrefix(token, "v"); strings.Contains(candidate, ".") {
			return candidate, nil
		}
	}
	return "", fmt.Errorf("could not parse version from cloud-hypervisor output: %q", banner)
}

View File

@ -10,12 +10,22 @@ import (
// ConnTracker tracks active proxy connections for a single sandbox and
// provides a drain mechanism for pre-pause graceful shutdown.
// It is safe for concurrent use.
//
// Internally we do not use sync.WaitGroup because Wait cannot be interrupted
// — a stuck handler would pin the waiter goroutine forever. Instead we keep
// an explicit counter guarded by mu plus a zeroCh that is closed when the
// counter transitions to 0, allowing Drain/ForceClose to select on it
// alongside cancellation and timeout signals without spawning helper
// goroutines that could leak across Reset boundaries.
type ConnTracker struct {
draining atomic.Bool
wg sync.WaitGroup
mu sync.Mutex
count int
zeroCh chan struct{} // closed when count drops to 0; recreated on next Acquire
// cancelMu protects cancelDrain so Reset can signal a timed-out Drain
// goroutine to exit, preventing goroutine leaks on repeated pause failures.
// to exit early.
cancelMu sync.Mutex
cancelDrain chan struct{}
@ -40,13 +50,18 @@ func (t *ConnTracker) Acquire() bool {
if t.draining.Load() {
return false
}
t.wg.Add(1)
// Re-check after Add: Drain may have set draining between our Load
// and Add. If so, undo the Add and reject the connection.
t.mu.Lock()
// Re-check under mu so a concurrent Drain that flipped draining cannot
// race past us with the counter already incremented.
if t.draining.Load() {
t.wg.Done()
t.mu.Unlock()
return false
}
t.count++
if t.count == 1 {
t.zeroCh = make(chan struct{})
}
t.mu.Unlock()
return true
}
@ -63,11 +78,32 @@ func (t *ConnTracker) Context() context.Context {
// Release marks one connection as complete. Must be called exactly once
// per successful Acquire.
func (t *ConnTracker) Release() {
t.wg.Done()
t.mu.Lock()
t.count--
if t.count == 0 && t.zeroCh != nil {
close(t.zeroCh)
t.zeroCh = nil
}
t.mu.Unlock()
}
// waitDrain hands back a channel that is closed once no connections are in
// flight. When the counter is already zero the returned channel is closed
// before it is handed out, so a receive on it never blocks.
func (t *ConnTracker) waitDrain() <-chan struct{} {
	// Snapshot both fields under the lock so they are read consistently.
	t.mu.Lock()
	inFlight, pending := t.count, t.zeroCh
	t.mu.Unlock()

	if inFlight != 0 {
		return pending
	}
	closed := make(chan struct{})
	close(closed)
	return closed
}
// Drain marks the tracker as draining (all future Acquire calls return
// false) and waits up to timeout for in-flight connections to finish.
// Returns when the count hits 0, Reset is called, or the timeout fires —
// whichever happens first. No goroutine is leaked on timeout.
func (t *ConnTracker) Drain(timeout time.Duration) {
t.draining.Store(true)
@ -76,16 +112,9 @@ func (t *ConnTracker) Drain(timeout time.Duration) {
t.cancelDrain = cancel
t.cancelMu.Unlock()
done := make(chan struct{})
go func() {
t.wg.Wait()
close(done)
}()
select {
case <-done:
case <-t.waitDrain():
case <-cancel:
// Reset was called; stop waiting.
case <-time.After(timeout):
}
}
@ -101,22 +130,16 @@ func (t *ConnTracker) ForceClose() {
}
t.ctxMu.Unlock()
// Wait briefly for force-closed connections to call Release().
done := make(chan struct{})
go func() {
t.wg.Wait()
close(done)
}()
select {
case <-done:
case <-t.waitDrain():
case <-time.After(2 * time.Second):
}
}
// Reset re-enables the tracker after a failed drain. This allows the
// sandbox to accept proxy connections again if the pause operation fails
// and the VM is resumed. It also cancels any lingering Drain goroutine
// and creates a fresh context for new connections.
// and the VM is resumed. It also signals any lingering Drain to exit and
// creates a fresh context for new connections.
func (t *ConnTracker) Reset() {
t.cancelMu.Lock()
if t.cancelDrain != nil {
@ -130,7 +153,6 @@ func (t *ConnTracker) Reset() {
}
t.cancelMu.Unlock()
// Replace the cancelled context with a fresh one.
t.ctxMu.Lock()
t.ctx, t.cancel = context.WithCancel(context.Background())
t.ctxMu.Unlock()

View File

@ -1,30 +0,0 @@
package sandbox
import (
"fmt"
"os/exec"
"strings"
)
// DetectFirecrackerVersion executes "<binaryPath> --version" and extracts the
// dotted version number from stdout (e.g. "Firecracker v1.14.1" → "1.14.1").
//
// The first whitespace-separated token that contains a "." after an optional
// leading "v" is removed is treated as the version. An error is returned when
// the binary cannot be run or no such token exists.
func DetectFirecrackerVersion(binaryPath string) (string, error) {
	raw, err := exec.Command(binaryPath, "--version").Output()
	if err != nil {
		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
	}

	// Output is typically "Firecracker v1.14.1\n" or similar.
	banner := strings.TrimSpace(string(raw))
	tokens := strings.Fields(banner)
	for i := range tokens {
		candidate := strings.TrimPrefix(tokens[i], "v")
		if !strings.Contains(candidate, ".") {
			continue
		}
		return candidate, nil
	}
	return "", fmt.Errorf("could not parse version from firecracker output: %q", banner)
}

View File

@ -9,6 +9,8 @@ import (
"strconv"
"strings"
"github.com/jackc/pgx/v5/pgtype"
"git.omukk.dev/wrenn/wrenn/internal/layout"
"git.omukk.dev/wrenn/wrenn/pkg/id"
)
@ -29,13 +31,9 @@ func EnsureImageSizes(wrennDir string, targetMB int) error {
}
targetBytes := int64(targetMB) * 1024 * 1024
// Expand the built-in minimal image.
minimalRootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, id.MinimalTemplateID)
if err := expandImage(minimalRootfs, targetBytes, targetMB); err != nil {
return err
}
// Walk teams/{teamDir}/{templateDir}/rootfs.ext4 two levels deep.
// Walk teams/{teamDir}/{templateDir}/rootfs.ext4 two levels deep. The
// built-in system base templates live under teams/{base36(0)}/... so this
// covers them too.
teamsDir := layout.TeamsDir(wrennDir)
teamEntries, err := os.ReadDir(teamsDir)
if err != nil {
@ -104,12 +102,19 @@ func ParseSizeToMB(s string) (int, error) {
}
}
// ShrinkMinimalImage shrinks the built-in minimal rootfs back to its minimum
// size using resize2fs -M. This is the inverse of EnsureImageSizes and should
// be called during graceful shutdown so the image is stored compactly on disk.
func ShrinkMinimalImage(wrennDir string) {
minimalRootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, id.MinimalTemplateID)
shrinkImage(minimalRootfs)
// ShrinkSystemImages compacts every built-in system base rootfs image (Ubuntu,
// Alpine, Arch, Fedora) by passing each image path to shrinkImage. It is the
// counterpart of EnsureImageSizes and is meant to run during graceful
// shutdown so the images take less space on disk.
func ShrinkSystemImages(wrennDir string) {
	systemTemplates := [...]pgtype.UUID{
		id.UbuntuTemplateID,
		id.AlpineTemplateID,
		id.ArchTemplateID,
		id.FedoraTemplateID,
	}
	for i := range systemTemplates {
		rootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, systemTemplates[i])
		shrinkImage(rootfs)
	}
}
// shrinkImage shrinks a single rootfs image to its minimum size.

View File

@ -0,0 +1,187 @@
// Package sandbox: launching a fresh sandbox from a snapshot template.
//
// Mirrors the pause/resume restore path but produces a brand-new sandbox each
// call: fresh ID, fresh network slot, fresh CoW on top of the template's
// flattened rootfs. The CH process is launched with --restore + lazy memory
// (UFFD), and the post-restore memory loader is started so any subsequent
// CreateSnapshot taken from this descendant is self-contained (the
// pause-resume-pause chain guarantee, applied to template lineages).
package sandbox
import (
"context"
"fmt"
"log/slog"
"os"
"time"
"github.com/jackc/pgx/v5/pgtype"
"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
"git.omukk.dev/wrenn/wrenn/internal/layout"
"git.omukk.dev/wrenn/wrenn/internal/models"
"git.omukk.dev/wrenn/wrenn/internal/network"
"git.omukk.dev/wrenn/wrenn/pkg/id"
)
// createFromSnapshotTemplate launches a new sandbox from a snapshot-template
// directory (state.json + config.json + memory-ranges + rootfs.ext4).
//
// The caller has already verified IsSnapshotTemplate(templateDir). Resources
// acquired here are rolled back on any failure; on success the sandbox is
// registered in m.boxes and runs in StatusRunning.
//
// Parameters:
//   - sandboxID: ID of the new sandbox; also used to name the dm device
//     ("wrenn-" + sandboxID) and the per-sandbox directory.
//   - teamID, templateID: locate the template directory and its rootfs.
//   - vcpus, memoryMB: forwarded into the restore VM config and recorded on
//     the sandbox.
//   - timeoutSec: recorded on the sandbox; not otherwise used here.
//   - diskSizeMB: requested disk size; the CoW size is the larger of this
//     (converted to bytes) and the origin size.
//   - defaultUser, defaultEnv: passed through to initAndStartMemoryLoader.
//
// Returns the registered sandbox, the CoW size in bytes, and an error. On
// error the sandbox is nil and the size is 0.
func (m *Manager) createFromSnapshotTemplate(
ctx context.Context,
sandboxID string,
teamID, templateID pgtype.UUID,
vcpus, memoryMB, timeoutSec, diskSizeMB int,
defaultUser string,
defaultEnv map[string]string,
) (*models.Sandbox, int64, error) {
templateDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
meta, err := readSnapshotMeta(templateDir)
if err != nil {
return nil, 0, fmt.Errorf("read snapshot meta: %w", err)
}
if meta.SandboxDir == "" {
// CH's saved config.json hardcodes a tmpfs disk path; meta.SandboxDir
// is that exact path. A snapshot template without it cannot be launched.
return nil, 0, fmt.Errorf("snapshot template %s missing sandbox_dir in meta", templateDir)
}
// Acquire shared read-only loop on the flattened rootfs. Many sandboxes
// can share this loop concurrently — refcounted in LoopRegistry.
//
// Until res is built below, each failure path releases the loop by hand.
originLoop, err := m.loops.Acquire(baseRootfs)
if err != nil {
return nil, 0, fmt.Errorf("acquire loop: %w", err)
}
originSize, err := devicemapper.OriginSizeBytes(originLoop)
if err != nil {
m.loops.Release(baseRootfs)
return nil, 0, fmt.Errorf("origin size: %w", err)
}
// Per-sandbox CoW on top of the shared origin.
dmName := "wrenn-" + sandboxID
if err := os.MkdirAll(layout.SandboxDir(m.cfg.WrennDir, sandboxID), 0o755); err != nil {
m.loops.Release(baseRootfs)
return nil, 0, fmt.Errorf("create sandbox dir: %w", err)
}
cowPath := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID)
// The CoW is never smaller than the origin, even if diskSizeMB asks for less.
cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
if err != nil {
m.loops.Release(baseRootfs)
return nil, 0, fmt.Errorf("create dm-snapshot: %w", err)
}
// From here on, failures unwind through res.rollback(); fields are filled
// in as each further resource is acquired.
res := &createResources{
sandboxID: sandboxID,
loops: m.loops,
loopImage: baseRootfs,
dmDevice: dmDev,
cowPath: cowPath,
slots: m.slots,
}
slotIdx, err := m.slots.Allocate()
if err != nil {
res.rollback()
return nil, 0, fmt.Errorf("allocate network slot: %w", err)
}
res.slotIdx = slotIdx
slot := network.NewSlot(slotIdx)
if err := network.CreateNetwork(slot); err != nil {
res.rollback()
return nil, 0, fmt.Errorf("create network: %w", err)
}
res.slot = slot
// CH's saved config.json hardcodes a tmpfs disk path; meta.SandboxDir is
// that exact path (carried forward verbatim across template chains, so a
// snapshot-of-a-snapshot resolves to the root ancestor's path). The
// launcher mounts a fresh tmpfs there inside its private mount namespace
// and symlinks rootfs.ext4 → our new dm device.
vmCfg := m.buildRestoreVMConfig(restoreInputs{
sandboxID: sandboxID,
templateID: id.UUIDString(templateID),
snapDir: templateDir,
rootfsPath: dmDev.DevicePath,
vcpus: vcpus,
memoryMB: memoryMB,
slot: slot,
sandboxDir: meta.SandboxDir,
})
client, err := m.launchRestoredVM(ctx, vmCfg, slot.HostIP.String())
if err != nil {
// The launch error is returned as-is, without extra wrapping.
res.rollback()
return nil, 0, err
}
// Recorded only after a successful launch; no later step in this function
// calls res.rollback().
res.vm = m.vm
// The FetchVersion error is deliberately dropped: envdVersion is only used
// to build the sandbox metadata below.
envdVersion, _ := client.FetchVersion(ctx)
now := time.Now()
sb := &sandboxState{
Sandbox: models.Sandbox{
ID: sandboxID,
Status: models.StatusRunning,
TemplateTeamID: teamID.Bytes,
TemplateID: templateID.Bytes,
VCPUs: vcpus,
MemoryMB: memoryMB,
TimeoutSec: timeoutSec,
SlotIndex: slotIdx,
HostIP: slot.HostIP,
RootfsPath: dmDev.DevicePath,
CreatedAt: now,
LastActiveAt: now,
Metadata: m.buildMetadata(envdVersion),
},
slot: slot,
connTracker: &ConnTracker{},
dmDevice: dmDev,
baseImagePath: baseRootfs,
sandboxDirOverride: meta.SandboxDir,
}
sb.client.Store(client)
// Register the sandbox before running /init and starting the background
// workers below.
m.mu.Lock()
m.boxes[sandboxID] = sb
m.mu.Unlock()
// /init lifecycle bump then start the memory loader. Loader is required
// so any future CreateSnapshot taken from this descendant captures all
// guest pages (otherwise SEEK_DATA/SEEK_HOLE would emit holes for the
// still-lazy UFFD pages — silent corruption across template chains).
m.initAndStartMemoryLoader(ctx, sb, defaultUser, id.UUIDString(templateID), defaultEnv)
m.startSampler(sb)
m.startCrashWatcher(sb)
slog.Info("sandbox launched from snapshot template",
"id", sandboxID,
"team_id", teamID,
"template_id", templateID,
"sandbox_dir", meta.SandboxDir,
"host_ip", slot.HostIP.String(),
"dm_device", dmDev.DevicePath,
)
return &sb.Sandbox, cowSize, nil
}
// templateExists reports whether TemplateDir(team, templateID) is present on
// disk and is recognised as a snapshot template. CreateSnapshot relies on it
// to refuse silent overwrites — every snapshot must land in a fresh
// templateID. Any stat failure is treated as "does not exist".
func (m *Manager) templateExists(teamID, templateID pgtype.UUID) bool {
	tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
	_, statErr := os.Stat(tmplDir)
	return statErr == nil && layout.IsSnapshotTemplate(tmplDir)
}

File diff suppressed because it is too large Load Diff

1180
internal/sandbox/pause.go Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,14 @@
package sandbox
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
"syscall"
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
)
@ -48,42 +49,43 @@ func readCPUStat(pid int) (cpuStat, error) {
return cpuStat{utime: utime, stime: stime}, nil
}
// readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns
// guest-side total - MemAvailable (actual process memory, excluding reclaimable
// page cache). VmRSS of the Firecracker process includes guest page cache and
// never decreases, so this is the accurate metric for dashboard display.
func readEnvdMemUsed(client *envdclient.Client) (int64, error) {
resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics")
// envdMetrics holds metric values read from envd's /metrics endpoint.
type envdMetrics struct {
MemBytes int64
DiskBytes int64
}
// readEnvdMetrics fetches mem_used and disk_used from envd's /metrics endpoint.
// Returns guest-side process memory (total - available) and filesystem usage
// from statfs("/"). These are the guest-visible metrics users care about.
func readEnvdMetrics(ctx context.Context, client *envdclient.Client) (envdMetrics, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, client.BaseURL()+"/metrics", nil)
if err != nil {
return 0, fmt.Errorf("fetch envd metrics: %w", err)
return envdMetrics{}, fmt.Errorf("build metrics request: %w", err)
}
resp, err := client.HTTPClient().Do(req)
if err != nil {
return envdMetrics{}, fmt.Errorf("fetch envd metrics: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return 0, fmt.Errorf("envd metrics: status %d", resp.StatusCode)
return envdMetrics{}, fmt.Errorf("envd metrics: status %d", resp.StatusCode)
}
body, err := io.ReadAll(resp.Body)
if err != nil {
return 0, fmt.Errorf("read envd metrics body: %w", err)
return envdMetrics{}, fmt.Errorf("read envd metrics body: %w", err)
}
var m struct {
MemUsed int64 `json:"mem_used"`
MemUsed int64 `json:"mem_used"`
DiskUsed int64 `json:"disk_used"`
}
if err := json.Unmarshal(body, &m); err != nil {
return 0, fmt.Errorf("decode envd metrics: %w", err)
return envdMetrics{}, fmt.Errorf("decode envd metrics: %w", err)
}
return m.MemUsed, nil
}
// readDiskAllocated returns the actual allocated bytes (not apparent size)
// of the file at path. This uses stat's block count × 512.
func readDiskAllocated(path string) (int64, error) {
var stat syscall.Stat_t
if err := syscall.Stat(path, &stat); err != nil {
return 0, fmt.Errorf("stat %s: %w", path, err)
}
return stat.Blocks * 512, nil
return envdMetrics{MemBytes: m.MemUsed, DiskBytes: m.DiskUsed}, nil
}

186
internal/sandbox/punch.go Normal file
View File

@ -0,0 +1,186 @@
// Package sandbox: post-snapshot hole punching for memory-ranges files.
//
// CH v52's SEEK_DATA/SEEK_HOLE snapshot writer only skips ranges that are
// already holes in the source memfd. Pages the guest never reported as free are
// written verbatim — including pages whose contents happen to be all zero
// (fresh allocations the guest scribbled then released without telling the
// balloon driver). Walking the resulting file and punching any 4 KiB block
// of zeros recovers that space without any guest cooperation.
package sandbox
import (
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strings"
"golang.org/x/sys/unix"
)
const (
// punchBlockSize is the granularity at which we test for zero runs and
// issue FALLOC_FL_PUNCH_HOLE. Matches the kernel page size and the
// minimum hole size on ext4.
// NOTE(review): this assumes 4 KiB pages and 4 KiB filesystem blocks —
// confirm for hosts configured differently.
// Must stay a power of two: the scan aligns offsets with a
// "&^ (punchBlockSize - 1)" bit mask.
punchBlockSize = 4096
// punchReadSize is the IO chunk size used by the scan loop. We read
// many blocks per syscall and split them in-memory so a 20 GiB
// memory-ranges file costs ~20K read(2) syscalls instead of ~5M.
// Crucial under single-disk hosts where each syscall otherwise
// contends with sshd / journal IO.
punchReadSize = 1 << 20 // 1 MiB = 256 blocks
)
// punchZeroPagesInDir applies punchZeroPages to every regular entry in dir
// whose name starts with "memory". CH writes its memory dump as one or more
// such files inside the snapshot directory; everything else (config.json,
// state.json) is metadata and is left alone.
//
// Failures are logged and never returned: an unreadable directory aborts the
// whole pass, a failing file is skipped and the scan moves on to the next one.
func punchZeroPagesInDir(dir string) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		slog.Warn("punch: read snapshot dir", "dir", dir, "error", err)
		return
	}

	for _, entry := range entries {
		isMemoryFile := !entry.IsDir() && strings.HasPrefix(entry.Name(), "memory")
		if !isMemoryFile {
			continue
		}

		target := filepath.Join(dir, entry.Name())
		allocBefore, allocAfter, punchErr := punchZeroPages(target)
		if punchErr != nil {
			slog.Warn("punch: zero-page scan failed", "path", target, "error", punchErr)
			continue
		}
		slog.Info("punch: zero-page scan done",
			"path", target,
			"alloc_before", allocBefore,
			"alloc_after", allocAfter,
			"reclaimed", allocBefore-allocAfter)
	}
}
// punchZeroPages scans path block-by-block, batching runs of all-zero 4 KiB
// blocks and punching them out via FALLOC_FL_PUNCH_HOLE. Existing holes are
// skipped via SEEK_DATA so a partially-sparse input stays cheap to scan.
//
// Each data extent is located with SEEK_DATA and its end with SEEK_HOLE
// issued from the offset SEEK_DATA actually returned. Scanning then starts at
// that offset rounded down to a punchBlockSize boundary so that only whole,
// aligned blocks are punched. Seeking the hole from the real data start (and
// not from the rounded-down offset) guarantees the outer loop always moves
// forward, even when an extent does not begin on a 4 KiB boundary.
//
// A trailing block shorter than punchBlockSize is never punched.
//
// Returns the file's disk allocation (st_blocks * 512) before and after. On
// error both sizes are 0.
func punchZeroPages(path string) (int64, int64, error) {
	f, err := os.OpenFile(path, os.O_RDWR, 0)
	if err != nil {
		return 0, 0, err
	}
	defer f.Close()

	stBefore, err := statBlocks(f)
	if err != nil {
		return 0, 0, fmt.Errorf("stat before: %w", err)
	}
	fi, err := f.Stat()
	if err != nil {
		return 0, 0, fmt.Errorf("stat: %w", err)
	}
	size := fi.Size()

	buf := make([]byte, punchReadSize)
	off := int64(0)
	for off < size {
		// Skip ahead to next data region; nothing to do in holes.
		dataStart, err := f.Seek(off, unix.SEEK_DATA)
		if err != nil {
			// ENXIO means there is no data at or after off: we are done.
			if errors.Is(err, io.EOF) || errors.Is(err, unix.ENXIO) {
				break
			}
			return 0, 0, fmt.Errorf("seek_data @ %d: %w", off, err)
		}

		// Find end of this data extent, measured from the real data start.
		endData, err := f.Seek(dataStart, unix.SEEK_HOLE)
		if err != nil {
			return 0, 0, fmt.Errorf("seek_hole @ %d: %w", dataStart, err)
		}
		if endData <= dataStart {
			// Only possible if the file changed underneath us; bail out
			// rather than spin on the same offset.
			return 0, 0, fmt.Errorf("seek_hole @ %d: extent end %d does not advance", dataStart, endData)
		}

		// Scan [cur, endData) chunk by chunk; batch zero runs across both
		// intra-chunk and inter-chunk boundaries so a contiguous zero
		// region is punched in a single fallocate. cur may sit slightly
		// before dataStart; those bytes read back as zeros or as the tail
		// of the previous extent, both of which are safe to re-examine.
		cur := dataStart &^ (punchBlockSize - 1) // align down to block
		zeroStart := int64(-1)
		for cur < endData {
			toRead := min(int64(len(buf)), endData-cur)
			n, err := readAt(f, buf[:toRead], cur)
			if err != nil {
				return 0, 0, fmt.Errorf("read @ %d: %w", cur, err)
			}
			if n == 0 {
				break
			}
			// Walk the chunk one block at a time, tracking zero runs.
			for blkOff := 0; blkOff < n; blkOff += punchBlockSize {
				blkEnd := min(blkOff+punchBlockSize, n)
				blk := buf[blkOff:blkEnd]
				blkAbs := cur + int64(blkOff)
				if len(blk) == punchBlockSize && isZero(blk) {
					if zeroStart < 0 {
						zeroStart = blkAbs
					}
					continue
				}
				// Non-zero (or short) block terminates the current run.
				if zeroStart >= 0 {
					if err := punch(f, zeroStart, blkAbs-zeroStart); err != nil {
						return 0, 0, err
					}
					zeroStart = -1
				}
			}
			cur += int64(n)
		}
		// Flush a zero run that reached the end of the extent.
		if zeroStart >= 0 {
			if err := punch(f, zeroStart, cur-zeroStart); err != nil {
				return 0, 0, err
			}
		}
		off = endData
	}

	stAfter, err := statBlocks(f)
	if err != nil {
		return 0, 0, fmt.Errorf("stat after: %w", err)
	}
	return stBefore, stAfter, nil
}
// punch deallocates length bytes of f starting at off by calling fallocate
// with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, so the file's apparent
// size is left unchanged. The returned error, if any, carries the offset and
// length that failed.
func punch(f *os.File, off, length int64) error {
	const flags = unix.FALLOC_FL_PUNCH_HOLE | unix.FALLOC_FL_KEEP_SIZE
	err := unix.Fallocate(int(f.Fd()), uint32(flags), off, length)
	if err == nil {
		return nil
	}
	return fmt.Errorf("fallocate punch @ %d len %d: %w", off, length, err)
}
// readAt wraps f.ReadAt and treats io.EOF as a normal short read: the number
// of bytes read is returned with a nil error. Every other error is passed
// through unchanged together with the byte count.
func readAt(f *os.File, buf []byte, off int64) (int, error) {
	switch n, err := f.ReadAt(buf, off); err {
	case io.EOF:
		return n, nil
	default:
		return n, err
	}
}
// isZero reports whether every byte of b is zero. An empty or nil slice
// counts as all-zero.
func isZero(b []byte) bool {
	for i := 0; i < len(b); i++ {
		if b[i] != 0 {
			return false
		}
	}
	return true
}
// statBlocks returns the disk space allocated to f in bytes, computed as the
// st_blocks value reported by fstat multiplied by 512.
func statBlocks(f *os.File) (int64, error) {
	st := unix.Stat_t{}
	err := unix.Fstat(int(f.Fd()), &st)
	if err != nil {
		return 0, err
	}
	allocated := int64(st.Blocks) * 512
	return allocated, nil
}

118
internal/sandbox/restore.go Normal file
View File

@ -0,0 +1,118 @@
// Package sandbox: shared CH-restore helpers used by both Resume (paused →
// running) and the snapshot-template launch path (template → fresh sandbox).
//
// The two callers diverge in how they acquire resources (slot, dm-snapshot,
// sandbox identity) but converge on:
//
// build VMConfig → CreateFromSnapshot → vm.Resume → wait envd → balloon deflate
//
// These steps are extracted here so the sequence — and its quirks (paused
// post-restore state, balloon best-effort, restored disk path baked into
// CH's config.json) — has a single source of truth.
package sandbox
import (
"context"
"fmt"
"log/slog"
"path/filepath"
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
"git.omukk.dev/wrenn/wrenn/internal/network"
"git.omukk.dev/wrenn/wrenn/internal/vm"
)
// restoreInputs is the common set of fields needed to build a restore VMConfig.
// buildRestoreVMConfig copies each field into the matching vm.VMConfig field.
type restoreInputs struct {
sandboxID string // VM identity for the new CH process (sock path, log file)
templateID string // forwarded to envd via PostInit (informational)
snapDir string // directory containing CH snapshot artefacts
rootfsPath string // /dev/mapper/wrenn-{newID} — per-sandbox dm-snapshot
// vcpus is copied to VMConfig.VCPUs.
vcpus int
// memoryMB is copied to VMConfig.MemoryMB.
memoryMB int
// slot supplies the network namespace, tap device/MAC and IP settings.
// buildRestoreVMConfig dereferences it unconditionally, so it must be non-nil.
slot *network.Slot
sandboxDir string // override for VMConfig.SandboxDir; "" = default
}
// buildRestoreVMConfig produces the VMConfig for starting a CH process in
// restore mode with lazy memory enabled. Host-wide settings (kernel, VMM
// binary, log directory) come from the manager config; everything else comes
// from in.
//
// A non-empty in.sandboxDir overrides the default "/tmp/ch-vm-{SandboxID}" —
// required when the snapshot's saved config.json points at a different
// sandbox's tmpfs path (i.e. snapshot-template launch).
func (m *Manager) buildRestoreVMConfig(in restoreInputs) vm.VMConfig {
	slot := in.slot

	// Identity and sizing.
	cfg := vm.VMConfig{
		SandboxID:  in.sandboxID,
		TemplateID: in.templateID,
		KernelPath: m.cfg.KernelPath,
		RootfsPath: in.rootfsPath,
		VCPUs:      in.vcpus,
		MemoryMB:   in.memoryMB,
		VMMBin:     m.cfg.VMMBin,
		LogDir:     filepath.Join(m.cfg.WrennDir, "logs"),
	}

	// Networking, all taken from the allocated slot.
	cfg.NetworkNamespace = slot.NamespaceID
	cfg.TapDevice = slot.TapName
	cfg.TapMAC = slot.TapMAC
	cfg.GuestIP = slot.GuestIP
	cfg.GatewayIP = slot.TapIP
	cfg.NetMask = slot.GuestNetMask

	// Restore-specific settings.
	cfg.RestoreFromDir = in.snapDir
	cfg.RestoreLazyMemory = true
	cfg.SandboxDir = in.sandboxDir

	return cfg
}
// launchRestoredVM brings up a CH process from a snapshot and returns a
// connected envd client. The sequence is: create the VM from the snapshot,
// resume its vCPUs, wait (bounded by envdReadyTimeout) for envd to answer,
// then deflate the balloon on a best-effort basis.
//
// If resume or the envd wait fails, the partial VM is destroyed before the
// error is returned. Tearing down dm/network/slot remains the caller's job.
// A balloon failure is only logged.
func (m *Manager) launchRestoredVM(ctx context.Context, vmCfg vm.VMConfig, hostIP string) (*envdclient.Client, error) {
	sandboxID := vmCfg.SandboxID

	// destroyPartial removes the half-started VM. It uses a background
	// context so cleanup still runs when ctx is already cancelled; the
	// Destroy error is intentionally ignored (best-effort cleanup).
	destroyPartial := func() {
		_ = m.vm.Destroy(context.Background(), sandboxID)
	}

	_, err := m.vm.CreateFromSnapshot(ctx, vmCfg)
	if err != nil {
		return nil, fmt.Errorf("create from snapshot: %w", err)
	}

	if err = m.vm.Resume(ctx, sandboxID); err != nil {
		destroyPartial()
		return nil, fmt.Errorf("vm resume: %w", err)
	}

	client := envdclient.New(hostIP)
	readyCtx, cancelReady := context.WithTimeout(ctx, envdReadyTimeout(vmCfg.MemoryMB))
	defer cancelReady()
	if err = client.WaitUntilReady(readyCtx); err != nil {
		destroyPartial()
		return nil, fmt.Errorf("wait envd: %w", err)
	}

	// Best-effort balloon deflate. Free-page reporting drains pages while the
	// sandbox runs; the resumed guest needs its full memory budget back. A
	// failure leaves the guest memory-starved but doesn't break correctness.
	if deflateErr := m.vm.UpdateBalloon(ctx, sandboxID, 0); deflateErr != nil {
		slog.Warn("balloon deflate after restore failed", "id", sandboxID, "error", deflateErr)
	}
	return client, nil
}
// initAndStartMemoryLoader performs envd's /init lifecycle bump and then
// starts the background memory loader. Ordering matters: /init resets envd's
// mem_preload_* atomics, so the loader's POST /memory/preload must land
// after — otherwise the next CreateSnapshot/Pause would observe a stale
// "idle" state and snapshot a memfile full of holes.
//
// If the sandbox's envd client has been cleared, a warning is logged and the
// function returns without starting the loader. A PostInit failure is logged
// and the loader is started anyway.
//
// Must be called with sb already registered in m.boxes with StatusRunning
// and sb.client populated.
func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState, defaultUser, templateIDStr string, envVars map[string]string) {
	client := sb.client.Load()
	if client == nil {
		slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
		return
	}

	// Bound the /init call by the configured envd timeout.
	initCtx, cancelInit := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
	defer cancelInit()

	err := client.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr)
	if err != nil {
		slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", err)
	}

	m.startMemoryLoader(sb)
}

View File

@ -0,0 +1,208 @@
package sandbox
import (
"fmt"
"log/slog"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/google/uuid"
"git.omukk.dev/wrenn/wrenn/internal/layout"
"git.omukk.dev/wrenn/wrenn/internal/models"
)
// RestorePausedSandboxes scans WRENN_DIR/sandboxes/ for paused-sandbox
// snapshots left behind by a previous agent instance and re-registers them
// in m.boxes as StatusPaused. Without this, ListSandboxes would not report
// these sandboxes, and the CP's HostMonitor would mark them stopped via
// the missing-confirmed-dead reconcile path — orphaning the on-disk
// snapshot dir and surfacing a leaked "stopped" sandbox to users.
//
// Restored sandboxes hold ONLY the slot reservation; VM / network / dm /
// loop refcount stay unowned until Resume rebuilds them. baseImagePath is
// deliberately NOT set on the in-memory entry so cleanup() does not call
// loops.Release on a loop that was never Acquire'd — the registry tolerates
// a Release of an unknown key, but a coincident-same-base running sandbox
// would have its refcount decremented incorrectly.
//
// Must be called once at agent startup, AFTER CleanupOrphanPauseDirs (so
// .staging-* / .trash-* dirs are gone) and BEFORE the HTTP server starts
// serving — otherwise an early Create RPC can race the slot reservation.
//
// Corrupt snapshot dirs (unparseable meta, missing slot index, bad team or
// template ID) are renamed to .trash-{ts}/ so a future
// CleanupOrphanPauseDirs sweeps them. When several snapshots claim the same
// slot, the one with the newest CreatedAt wins; ties keep directory-listing
// order so the choice is deterministic. Soft errors are logged; this
// function never returns an error — startup should not fail because a
// single sandbox is unrecoverable.
func (m *Manager) RestorePausedSandboxes() {
	sandboxesDir := layout.SandboxesDir(m.cfg.WrennDir)
	entries, err := os.ReadDir(sandboxesDir)
	if err != nil {
		// A missing directory is the fresh-install case: nothing to
		// restore and nothing worth logging. Anything else (permissions,
		// IO error) means paused sandboxes may be silently lost, so
		// surface it.
		if !os.IsNotExist(err) {
			slog.Warn("restore: read sandboxes dir", "dir", sandboxesDir, "error", err)
		}
		return
	}

	type candidate struct {
		sandboxID string
		snapDir   string
		meta      *snapshotMeta
		teamID    [16]byte
		templID   [16]byte
	}

	// Pass 1: parse every snapshot meta. Trash anything unreadable or
	// missing the slot index — those are crash artefacts, not recoverable
	// sandboxes.
	candidates := make([]candidate, 0, len(entries))
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		name := e.Name()
		// Skip CleanupOrphanPauseDirs's territory. If it ran before us
		// these are already gone; if not, leave them alone.
		if strings.Contains(name, ".staging-") || strings.Contains(name, ".trash-") {
			continue
		}
		snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, name)
		meta, err := readSnapshotMeta(snapDir)
		if err != nil {
			slog.Warn("restore: unreadable snapshot meta, trashing dir",
				"id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}
		if meta.SlotIndex == 0 {
			slog.Warn("restore: snapshot has no slot_index, trashing dir", "id", name)
			trashCorruptDir(snapDir)
			continue
		}
		teamBytes, err := parsePlainUUID(meta.TeamID)
		if err != nil {
			slog.Warn("restore: bad team_id in snapshot meta", "id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}
		templateBytes, err := parsePlainUUID(meta.TemplateID)
		if err != nil {
			slog.Warn("restore: bad template_id in snapshot meta", "id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}
		candidates = append(candidates, candidate{
			sandboxID: name,
			snapDir:   snapDir,
			meta:      meta,
			teamID:    teamBytes,
			templID:   templateBytes,
		})
	}

	// Pass 2: bucket by slot index, pick the newest CreatedAt per slot.
	// Multiple candidates per slot happen when older paused-sandbox dirs
	// were left on disk by the pre-fix leak (DB row marked stopped but the
	// snapshot was never cleaned). The newest is the most likely live one;
	// older losers are trashed so CleanupOrphanPauseDirs sweeps them on
	// the next startup.
	bySlot := make(map[int][]candidate, len(candidates))
	for _, c := range candidates {
		bySlot[c.meta.SlotIndex] = append(bySlot[c.meta.SlotIndex], c)
	}

	restored := 0
	pruned := 0
	for slot, cands := range bySlot {
		// Stable sort: candidates with identical CreatedAt keep their
		// directory-listing order, so the winner does not vary between
		// startups.
		sort.SliceStable(cands, func(i, j int) bool {
			return cands[i].meta.CreatedAt.After(cands[j].meta.CreatedAt)
		})
		// Trash every loser. The host_monitor's zombie-cleanup path catches
		// the winner if its DB row says 'stopped' — but losers never enter
		// m.boxes and would otherwise sit on disk indefinitely.
		for _, stale := range cands[1:] {
			slog.Info("restore: pruning older snapshot for same slot",
				"id", stale.sandboxID, "slot", slot, "created", stale.meta.CreatedAt,
				"winner", cands[0].sandboxID, "winner_created", cands[0].meta.CreatedAt)
			trashCorruptDir(stale.snapDir)
			pruned++
		}

		winner := cands[0]
		if err := m.slots.Reserve(winner.meta.SlotIndex); err != nil {
			// The snapshot is unusable without its slot (already taken or
			// rejected by the allocator), so trash it.
			slog.Warn("restore: slot reservation failed, trashing dir",
				"id", winner.sandboxID, "slot", winner.meta.SlotIndex, "error", err)
			trashCorruptDir(winner.snapDir)
			pruned++
			continue
		}

		sb := &sandboxState{
			Sandbox: models.Sandbox{
				ID:             winner.sandboxID,
				Status:         models.StatusPaused,
				TemplateTeamID: winner.teamID,
				TemplateID:     winner.templID,
				VCPUs:          winner.meta.VCPUs,
				MemoryMB:       winner.meta.MemoryMB,
				TimeoutSec:     winner.meta.TimeoutSec,
				SlotIndex:      winner.meta.SlotIndex,
				CreatedAt:      winner.meta.CreatedAt,
				// LastActiveAt cosmetic only — TTL reaper ignores non-Running.
				LastActiveAt: winner.meta.CreatedAt,
			},
			// connTracker must be non-nil: resumeFromMeta calls Reset() on it
			// unconditionally during rehydration. A nil pointer would panic.
			connTracker: &ConnTracker{},
			// baseImagePath intentionally left empty — see function doc.
			// sandboxDirOverride intentionally left empty — resumeFromMeta
			// reads meta.SandboxDir from disk on the resume path.
		}
		m.mu.Lock()
		m.boxes[winner.sandboxID] = sb
		m.mu.Unlock()
		restored++
		slog.Info("restored paused sandbox", "id", winner.sandboxID,
			"slot", winner.meta.SlotIndex, "vcpus", winner.meta.VCPUs, "memory_mb", winner.meta.MemoryMB)
	}

	if restored > 0 || pruned > 0 {
		slog.Info("paused sandbox restore complete", "restored", restored, "pruned", pruned)
	}
}
// parsePlainUUID converts a UUID string (as produced by id.UUIDString) into
// the 16-byte array form stored on sandboxState. An empty string is rejected
// up front; any other parse failure from uuid.Parse is returned unchanged.
// The array is all zeroes whenever an error is returned.
func parsePlainUUID(s string) ([16]byte, error) {
	var none [16]byte
	if len(s) == 0 {
		return none, fmt.Errorf("empty uuid string")
	}
	parsed, err := uuid.Parse(s)
	if err != nil {
		return none, err
	}
	return [16]byte(parsed), nil
}
// trashCorruptDir moves a corrupt snapshot directory out of the way by
// renaming it to "<name>.trash-<unix-nanos>" next to the original, so a
// future CleanupOrphanPauseDirs sweeps it. Best-effort: a failed rename is
// logged and otherwise ignored — leaving the directory in place is safe
// (restore will skip it again next startup) but unwanted.
func trashCorruptDir(dir string) {
	stamp := time.Now().UnixNano()
	trashName := fmt.Sprintf("%s.trash-%d", filepath.Base(dir), stamp)
	dst := filepath.Join(filepath.Dir(dir), trashName)

	err := os.Rename(dir, dst)
	if err == nil {
		return
	}
	slog.Warn("restore: failed to trash corrupt snapshot dir",
		"src", dir, "dst", dst, "error", err)
}