v0.2.0 (#50)

Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
2026-05-24 21:10:37 +00:00
parent 4707f16c76
commit 05ddf62399
203 changed files with 15815 additions and 9344 deletions
--- a/internal/sandbox/chversion.go
+++ b/internal/sandbox/chversion.go
@ -0,0 +1,28 @@
+package sandbox
+
+import (
+	"fmt"
+	"os/exec"
+	"strings"
+)
+
+// DetectCHVersion invokes the cloud-hypervisor binary at binaryPath with
+// --version and extracts the version number from what it prints
+// (e.g. "cloud-hypervisor v43.0" → "43.0").
+//
+// The output is split on whitespace; the first token that still contains a
+// dot once an optional leading "v" is dropped is returned. An error is
+// returned when the binary cannot be run or no such token is found.
+func DetectCHVersion(binaryPath string) (string, error) {
+	raw, err := exec.Command(binaryPath, "--version").Output()
+	if err != nil {
+		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
+	}
+
+	trimmed := strings.TrimSpace(string(raw))
+	for token := range strings.FieldsSeq(trimmed) {
+		candidate := strings.TrimPrefix(token, "v")
+		// A dot in the stripped token implies a dot in the raw token, so one
+		// containment check is enough to recognise a version-like token.
+		if !strings.Contains(candidate, ".") {
+			continue
+		}
+		return candidate, nil
+	}
+
+	return "", fmt.Errorf("could not parse version from cloud-hypervisor output: %q", trimmed)
+}
--- a/internal/sandbox/conntracker.go
+++ b/internal/sandbox/conntracker.go
@ -10,12 +10,22 @@ import (
 // ConnTracker tracks active proxy connections for a single sandbox and
 // provides a drain mechanism for pre-pause graceful shutdown.
 // It is safe for concurrent use.
+//
+// Internally we do not use sync.WaitGroup because Wait cannot be interrupted
+// — a stuck handler would pin the waiter goroutine forever. Instead we keep
+// an explicit counter guarded by mu plus a zeroCh that is closed when the
+// counter transitions to 0, allowing Drain/ForceClose to select on it
+// alongside cancellation and timeout signals without spawning helper
+// goroutines that could leak across Reset boundaries.
 type ConnTracker struct {
 	draining atomic.Bool
-	wg       sync.WaitGroup
+
+	mu     sync.Mutex
+	count  int
+	zeroCh chan struct{} // closed when count drops to 0; recreated on next Acquire

 	// cancelMu protects cancelDrain so Reset can signal a timed-out Drain
-	// goroutine to exit, preventing goroutine leaks on repeated pause failures.
+	// to exit early.
 	cancelMu    sync.Mutex
 	cancelDrain chan struct{}

@ -40,13 +50,18 @@ func (t *ConnTracker) Acquire() bool {
 	if t.draining.Load() {
 		return false
 	}
-	t.wg.Add(1)
-	// Re-check after Add: Drain may have set draining between our Load
-	// and Add. If so, undo the Add and reject the connection.
+	t.mu.Lock()
+	// Re-check under mu so a concurrent Drain that flipped draining cannot
+	// race past us with the counter already incremented.
 	if t.draining.Load() {
-		t.wg.Done()
+		t.mu.Unlock()
 		return false
 	}
+	t.count++
+	if t.count == 1 {
+		t.zeroCh = make(chan struct{})
+	}
+	t.mu.Unlock()
 	return true
 }

@ -63,11 +78,32 @@ func (t *ConnTracker) Context() context.Context {
 // Release marks one connection as complete. Must be called exactly once
 // per successful Acquire.
 func (t *ConnTracker) Release() {
-	t.wg.Done()
+	t.mu.Lock()
+	t.count--
+	if t.count == 0 && t.zeroCh != nil {
+		close(t.zeroCh)
+		t.zeroCh = nil
+	}
+	t.mu.Unlock()
+}
+
+// waitDrain hands back a channel the caller can block on until no
+// connections are in flight. When the count is already zero it returns a
+// freshly made, already-closed channel; otherwise it returns the current
+// zeroCh. Both count and zeroCh are read while holding mu.
+func (t *ConnTracker) waitDrain() <-chan struct{} {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.count != 0 {
+		return t.zeroCh
+	}
+	closed := make(chan struct{})
+	close(closed)
+	return closed
 }

 // Drain marks the tracker as draining (all future Acquire calls return
 // false) and waits up to timeout for in-flight connections to finish.
+// Returns when the count hits 0, Reset is called, or the timeout fires —
+// whichever happens first. No goroutine is leaked on timeout.
 func (t *ConnTracker) Drain(timeout time.Duration) {
 	t.draining.Store(true)

@ -76,16 +112,9 @@ func (t *ConnTracker) Drain(timeout time.Duration) {
 	t.cancelDrain = cancel
 	t.cancelMu.Unlock()

-	done := make(chan struct{})
-	go func() {
-		t.wg.Wait()
-		close(done)
-	}()
-
 	select {
-	case <-done:
+	case <-t.waitDrain():
 	case <-cancel:
-		// Reset was called; stop waiting.
 	case <-time.After(timeout):
 	}
 }
@ -101,22 +130,16 @@ func (t *ConnTracker) ForceClose() {
 	}
 	t.ctxMu.Unlock()

-	// Wait briefly for force-closed connections to call Release().
-	done := make(chan struct{})
-	go func() {
-		t.wg.Wait()
-		close(done)
-	}()
 	select {
-	case <-done:
+	case <-t.waitDrain():
 	case <-time.After(2 * time.Second):
 	}
 }

 // Reset re-enables the tracker after a failed drain. This allows the
 // sandbox to accept proxy connections again if the pause operation fails
-// and the VM is resumed. It also cancels any lingering Drain goroutine
-// and creates a fresh context for new connections.
+// and the VM is resumed. It also signals any lingering Drain to exit and
+// creates a fresh context for new connections.
 func (t *ConnTracker) Reset() {
 	t.cancelMu.Lock()
 	if t.cancelDrain != nil {
@ -130,7 +153,6 @@ func (t *ConnTracker) Reset() {
 	}
 	t.cancelMu.Unlock()

-	// Replace the cancelled context with a fresh one.
 	t.ctxMu.Lock()
 	t.ctx, t.cancel = context.WithCancel(context.Background())
 	t.ctxMu.Unlock()
--- a/internal/sandbox/fcversion.go
+++ b/internal/sandbox/fcversion.go
@ -1,30 +0,0 @@
-package sandbox
-
-import (
-	"fmt"
-	"os/exec"
-	"strings"
-)
-
-// DetectFirecrackerVersion runs the firecracker binary with --version and
-// parses the semver from the output (e.g. "Firecracker v1.14.1" → "1.14.1").
-func DetectFirecrackerVersion(binaryPath string) (string, error) {
-	out, err := exec.Command(binaryPath, "--version").Output()
-	if err != nil {
-		return "", fmt.Errorf("run %s --version: %w", binaryPath, err)
-	}
-
-	// Output is typically "Firecracker v1.14.1\n" or similar.
-	line := strings.TrimSpace(string(out))
-	for _, field := range strings.Fields(line) {
-		v := strings.TrimPrefix(field, "v")
-		if v != field || strings.Contains(field, ".") {
-			// Either had a "v" prefix or contains a dot — likely the version.
-			if strings.Count(v, ".") >= 1 {
-				return v, nil
-			}
-		}
-	}
-
-	return "", fmt.Errorf("could not parse version from firecracker output: %q", line)
-}
--- a/internal/sandbox/images.go
+++ b/internal/sandbox/images.go
@ -9,6 +9,8 @@ import (
 	"strconv"
 	"strings"

+	"github.com/jackc/pgx/v5/pgtype"
+
 	"git.omukk.dev/wrenn/wrenn/internal/layout"
 	"git.omukk.dev/wrenn/wrenn/pkg/id"
 )
@ -29,13 +31,9 @@ func EnsureImageSizes(wrennDir string, targetMB int) error {
 	}
 	targetBytes := int64(targetMB) * 1024 * 1024

-	// Expand the built-in minimal image.
-	minimalRootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, id.MinimalTemplateID)
-	if err := expandImage(minimalRootfs, targetBytes, targetMB); err != nil {
-		return err
-	}
-
-	// Walk teams/{teamDir}/{templateDir}/rootfs.ext4 two levels deep.
+	// Walk teams/{teamDir}/{templateDir}/rootfs.ext4 two levels deep. The
+	// built-in system base templates live under teams/{base36(0)}/... so this
+	// covers them too.
 	teamsDir := layout.TeamsDir(wrennDir)
 	teamEntries, err := os.ReadDir(teamsDir)
 	if err != nil {
@ -104,12 +102,19 @@ func ParseSizeToMB(s string) (int, error) {
 	}
 }

-// ShrinkMinimalImage shrinks the built-in minimal rootfs back to its minimum
-// size using resize2fs -M. This is the inverse of EnsureImageSizes and should
-// be called during graceful shutdown so the image is stored compactly on disk.
-func ShrinkMinimalImage(wrennDir string) {
-	minimalRootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, id.MinimalTemplateID)
-	shrinkImage(minimalRootfs)
+// ShrinkSystemImages calls shrinkImage on the rootfs of every built-in system
+// base template (Ubuntu, Alpine, Arch, Fedora) owned by the platform team, so
+// each image is reduced back to its minimum size (resize2fs -M). It is the
+// counterpart of EnsureImageSizes and is meant for graceful shutdown, leaving
+// the images stored compactly on disk.
+func ShrinkSystemImages(wrennDir string) {
+	systemTemplates := []pgtype.UUID{
+		id.UbuntuTemplateID,
+		id.AlpineTemplateID,
+		id.ArchTemplateID,
+		id.FedoraTemplateID,
+	}
+	for i := range systemTemplates {
+		rootfs := layout.TemplateRootfs(wrennDir, id.PlatformTeamID, systemTemplates[i])
+		shrinkImage(rootfs)
+	}
 }

 // shrinkImage shrinks a single rootfs image to its minimum size.
--- a/internal/sandbox/launch_snapshot.go
+++ b/internal/sandbox/launch_snapshot.go
@ -0,0 +1,187 @@
+// Package sandbox: launching a fresh sandbox from a snapshot template.
+//
+// Mirrors the pause/resume restore path but produces a brand-new sandbox each
+// call: fresh ID, fresh network slot, fresh CoW on top of the template's
+// flattened rootfs. The CH process is launched with --restore + lazy memory
+// (UFFD), and the post-restore memory loader is started so any subsequent
+// CreateSnapshot taken from this descendant is self-contained (the
+// pause-resume-pause chain guarantee, applied to template lineages).
+package sandbox
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"time"
+
+	"github.com/jackc/pgx/v5/pgtype"
+
+	"git.omukk.dev/wrenn/wrenn/internal/devicemapper"
+	"git.omukk.dev/wrenn/wrenn/internal/layout"
+	"git.omukk.dev/wrenn/wrenn/internal/models"
+	"git.omukk.dev/wrenn/wrenn/internal/network"
+	"git.omukk.dev/wrenn/wrenn/pkg/id"
+)
+
+// createFromSnapshotTemplate launches a new sandbox from a snapshot-template
+// directory (state.json + config.json + memory-ranges + rootfs.ext4).
+//
+// The caller has already verified IsSnapshotTemplate(templateDir). Resources
+// acquired here are rolled back on any failure; on success the sandbox is
+// registered in m.boxes and runs in StatusRunning.
+//
+// Parameters:
+//   - sandboxID: identity of the new sandbox; also used to name the dm device
+//     ("wrenn-" + sandboxID) and to locate the sandbox dir and CoW file.
+//   - teamID, templateID: locate the template directory and its rootfs.
+//   - vcpus, memoryMB: forwarded into the restore VM config and recorded on
+//     the sandbox model.
+//   - timeoutSec: recorded on the sandbox model.
+//   - diskSizeMB: requested disk size; the CoW size is the larger of this
+//     (converted to bytes) and the origin size.
+//   - defaultUser, defaultEnv: forwarded to initAndStartMemoryLoader.
+//
+// Returns the registered sandbox model, the CoW size in bytes, and an error.
+// On error the first two results are nil and 0.
+func (m *Manager) createFromSnapshotTemplate(
+	ctx context.Context,
+	sandboxID string,
+	teamID, templateID pgtype.UUID,
+	vcpus, memoryMB, timeoutSec, diskSizeMB int,
+	defaultUser string,
+	defaultEnv map[string]string,
+) (*models.Sandbox, int64, error) {
+	templateDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
+	baseRootfs := layout.TemplateRootfs(m.cfg.WrennDir, teamID, templateID)
+
+	meta, err := readSnapshotMeta(templateDir)
+	if err != nil {
+		return nil, 0, fmt.Errorf("read snapshot meta: %w", err)
+	}
+	if meta.SandboxDir == "" {
+		// CH's saved config.json hardcodes a tmpfs disk path; meta.SandboxDir
+		// is that exact path. A snapshot template without it cannot be launched.
+		return nil, 0, fmt.Errorf("snapshot template %s missing sandbox_dir in meta", templateDir)
+	}
+
+	// Acquire shared read-only loop on the flattened rootfs. Many sandboxes
+	// can share this loop concurrently — refcounted in LoopRegistry.
+	originLoop, err := m.loops.Acquire(baseRootfs)
+	if err != nil {
+		return nil, 0, fmt.Errorf("acquire loop: %w", err)
+	}
+	// Until the createResources value below exists, each failure path
+	// releases the loop by hand.
+	originSize, err := devicemapper.OriginSizeBytes(originLoop)
+	if err != nil {
+		m.loops.Release(baseRootfs)
+		return nil, 0, fmt.Errorf("origin size: %w", err)
+	}
+
+	// Per-sandbox CoW on top of the shared origin.
+	dmName := "wrenn-" + sandboxID
+	if err := os.MkdirAll(layout.SandboxDir(m.cfg.WrennDir, sandboxID), 0o755); err != nil {
+		m.loops.Release(baseRootfs)
+		return nil, 0, fmt.Errorf("create sandbox dir: %w", err)
+	}
+	cowPath := layout.SandboxCowPath(m.cfg.WrennDir, sandboxID)
+	// The CoW is never smaller than the origin, even if diskSizeMB is.
+	cowSize := max(int64(diskSizeMB)*1024*1024, originSize)
+	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize, cowSize)
+	if err != nil {
+		// NOTE(review): only the loop is released here; the sandbox directory
+		// created by MkdirAll above is left in place on this path.
+		m.loops.Release(baseRootfs)
+		return nil, 0, fmt.Errorf("create dm-snapshot: %w", err)
+	}
+
+	// From here on failures unwind through res.rollback() — presumably it
+	// tears down whatever has been recorded on res so far; confirm in
+	// createResources.
+	res := &createResources{
+		sandboxID: sandboxID,
+		loops:     m.loops,
+		loopImage: baseRootfs,
+		dmDevice:  dmDev,
+		cowPath:   cowPath,
+		slots:     m.slots,
+	}
+
+	slotIdx, err := m.slots.Allocate()
+	if err != nil {
+		res.rollback()
+		return nil, 0, fmt.Errorf("allocate network slot: %w", err)
+	}
+	res.slotIdx = slotIdx
+	slot := network.NewSlot(slotIdx)
+
+	if err := network.CreateNetwork(slot); err != nil {
+		res.rollback()
+		return nil, 0, fmt.Errorf("create network: %w", err)
+	}
+	// Recorded only after CreateNetwork succeeded.
+	res.slot = slot
+
+	// CH's saved config.json hardcodes a tmpfs disk path; meta.SandboxDir is
+	// that exact path (carried forward verbatim across template chains, so a
+	// snapshot-of-a-snapshot resolves to the root ancestor's path). The
+	// launcher mounts a fresh tmpfs there inside its private mount namespace
+	// and symlinks rootfs.ext4 → our new dm device.
+	vmCfg := m.buildRestoreVMConfig(restoreInputs{
+		sandboxID:  sandboxID,
+		templateID: id.UUIDString(templateID),
+		snapDir:    templateDir,
+		rootfsPath: dmDev.DevicePath,
+		vcpus:      vcpus,
+		memoryMB:   memoryMB,
+		slot:       slot,
+		sandboxDir: meta.SandboxDir,
+	})
+
+	// The error from launchRestoredVM is returned unwrapped.
+	client, err := m.launchRestoredVM(ctx, vmCfg, slot.HostIP.String())
+	if err != nil {
+		res.rollback()
+		return nil, 0, err
+	}
+	// NOTE(review): no failure path follows this assignment in this function,
+	// so res.rollback() is never called with vm set — confirm whether this
+	// store is still needed.
+	res.vm = m.vm
+
+	// The FetchVersion error is discarded; whatever version string came back
+	// is passed to buildMetadata as-is.
+	envdVersion, _ := client.FetchVersion(ctx)
+
+	now := time.Now()
+	sb := &sandboxState{
+		Sandbox: models.Sandbox{
+			ID:             sandboxID,
+			Status:         models.StatusRunning,
+			TemplateTeamID: teamID.Bytes,
+			TemplateID:     templateID.Bytes,
+			VCPUs:          vcpus,
+			MemoryMB:       memoryMB,
+			TimeoutSec:     timeoutSec,
+			SlotIndex:      slotIdx,
+			HostIP:         slot.HostIP,
+			RootfsPath:     dmDev.DevicePath,
+			CreatedAt:      now,
+			LastActiveAt:   now,
+			Metadata:       m.buildMetadata(envdVersion),
+		},
+		slot:               slot,
+		connTracker:        &ConnTracker{},
+		dmDevice:           dmDev,
+		baseImagePath:      baseRootfs,
+		sandboxDirOverride: meta.SandboxDir,
+	}
+	sb.client.Store(client)
+
+	// Register the sandbox before the init/loader step below, which expects
+	// the entry to be present in m.boxes.
+	m.mu.Lock()
+	m.boxes[sandboxID] = sb
+	m.mu.Unlock()
+
+	// /init lifecycle bump then start the memory loader. Loader is required
+	// so any future CreateSnapshot taken from this descendant captures all
+	// guest pages (otherwise SEEK_DATA/SEEK_HOLE would emit holes for the
+	// still-lazy UFFD pages — silent corruption across template chains).
+	m.initAndStartMemoryLoader(ctx, sb, defaultUser, id.UUIDString(templateID), defaultEnv)
+
+	m.startSampler(sb)
+	m.startCrashWatcher(sb)
+
+	slog.Info("sandbox launched from snapshot template",
+		"id", sandboxID,
+		"team_id", teamID,
+		"template_id", templateID,
+		"sandbox_dir", meta.SandboxDir,
+		"host_ip", slot.HostIP.String(),
+		"dm_device", dmDev.DevicePath,
+	)
+
+	return &sb.Sandbox, cowSize, nil
+}
+
+// templateExists reports whether TemplateDir(team, templateID) both exists on
+// disk (os.Stat succeeds) and is recognised by layout.IsSnapshotTemplate.
+// CreateSnapshot uses it to refuse silent overwrites — every snapshot must
+// land in a fresh templateID.
+func (m *Manager) templateExists(teamID, templateID pgtype.UUID) bool {
+	tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
+	_, statErr := os.Stat(tmplDir)
+	return statErr == nil && layout.IsSnapshotTemplate(tmplDir)
+}
--- a/internal/sandbox/manager.go
+++ b/internal/sandbox/manager.go
--- a/internal/sandbox/pause.go
+++ b/internal/sandbox/pause.go
--- a/internal/sandbox/proc.go
+++ b/internal/sandbox/proc.go
@ -1,13 +1,14 @@
 package sandbox

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"io"
+	"net/http"
 	"os"
 	"strconv"
 	"strings"
-	"syscall"

 	"git.omukk.dev/wrenn/wrenn/internal/envdclient"
 )
@ -48,42 +49,43 @@ func readCPUStat(pid int) (cpuStat, error) {
 	return cpuStat{utime: utime, stime: stime}, nil
 }

-// readEnvdMemUsed fetches mem_used from envd's /metrics endpoint. Returns
-// guest-side total - MemAvailable (actual process memory, excluding reclaimable
-// page cache). VmRSS of the Firecracker process includes guest page cache and
-// never decreases, so this is the accurate metric for dashboard display.
-func readEnvdMemUsed(client *envdclient.Client) (int64, error) {
-	resp, err := client.HTTPClient().Get(client.BaseURL() + "/metrics")
+// envdMetrics holds metric values read from envd's /metrics endpoint.
+type envdMetrics struct {
+	MemBytes  int64 // "mem_used" field of the /metrics JSON response
+	DiskBytes int64 // "disk_used" field of the /metrics JSON response
+}
+
+// readEnvdMetrics fetches mem_used and disk_used from envd's /metrics endpoint.
+// Returns guest-side process memory (total - available) and filesystem usage
+// from statfs("/"). These are the guest-visible metrics users care about.
+func readEnvdMetrics(ctx context.Context, client *envdclient.Client) (envdMetrics, error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, client.BaseURL()+"/metrics", nil)
 	if err != nil {
-		return 0, fmt.Errorf("fetch envd metrics: %w", err)
+		return envdMetrics{}, fmt.Errorf("build metrics request: %w", err)
+	}
+
+	resp, err := client.HTTPClient().Do(req)
+	if err != nil {
+		return envdMetrics{}, fmt.Errorf("fetch envd metrics: %w", err)
 	}
 	defer resp.Body.Close()

 	if resp.StatusCode != 200 {
-		return 0, fmt.Errorf("envd metrics: status %d", resp.StatusCode)
+		return envdMetrics{}, fmt.Errorf("envd metrics: status %d", resp.StatusCode)
 	}

 	body, err := io.ReadAll(resp.Body)
 	if err != nil {
-		return 0, fmt.Errorf("read envd metrics body: %w", err)
+		return envdMetrics{}, fmt.Errorf("read envd metrics body: %w", err)
 	}

 	var m struct {
-		MemUsed int64 `json:"mem_used"`
+		MemUsed  int64 `json:"mem_used"`
+		DiskUsed int64 `json:"disk_used"`
 	}
 	if err := json.Unmarshal(body, &m); err != nil {
-		return 0, fmt.Errorf("decode envd metrics: %w", err)
+		return envdMetrics{}, fmt.Errorf("decode envd metrics: %w", err)
 	}

-	return m.MemUsed, nil
-}
-
-// readDiskAllocated returns the actual allocated bytes (not apparent size)
-// of the file at path. This uses stat's block count × 512.
-func readDiskAllocated(path string) (int64, error) {
-	var stat syscall.Stat_t
-	if err := syscall.Stat(path, &stat); err != nil {
-		return 0, fmt.Errorf("stat %s: %w", path, err)
-	}
-	return stat.Blocks * 512, nil
+	return envdMetrics{MemBytes: m.MemUsed, DiskBytes: m.DiskUsed}, nil
 }
--- a/internal/sandbox/punch.go
+++ b/internal/sandbox/punch.go
@ -0,0 +1,186 @@
+// Package sandbox: post-snapshot hole punching for memory-ranges files.
+//
+// CH v52's SEEK_DATA/SEEK_HOLE snapshot writer only skips ranges that are
+// already holes in the source memfd. Pages the guest never reported as free are
+// written verbatim — including pages whose contents happen to be all zero
+// (fresh allocations the guest scribbled then released without telling the
+// balloon driver). Walking the resulting file and punching any 4 KiB block
+// of zeros recovers that space without any guest cooperation.
+package sandbox
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// punchBlockSize is the granularity at which we test for zero runs and
+	// issue FALLOC_FL_PUNCH_HOLE. Matches the kernel page size and the
+	// minimum hole size on ext4.
+	punchBlockSize = 4096
+
+	// punchReadSize is the IO chunk size used by the scan loop. We read
+	// many blocks per syscall and split them in-memory so a 20 GiB
+	// memory-ranges file costs ~20K read(2) syscalls instead of ~5M.
+	// Crucial under single-disk hosts where each syscall otherwise
+	// contends with sshd / journal IO.
+	punchReadSize = 1 << 20 // 1 MiB = 256 blocks
+)
+
+// punchZeroPagesInDir applies punchZeroPages to each regular entry in dir
+// whose name starts with "memory". CH writes its memory dump as one or more
+// such files inside the snapshot directory; all other entries (config.json,
+// state.json) are metadata and are left alone.
+//
+// Failures are logged and never returned: an unreadable directory ends the
+// call, a failed scan of one file moves on to the next.
+func punchZeroPagesInDir(dir string) {
+	listing, err := os.ReadDir(dir)
+	if err != nil {
+		slog.Warn("punch: read snapshot dir", "dir", dir, "error", err)
+		return
+	}
+	for _, entry := range listing {
+		name := entry.Name()
+		isMemoryFile := !entry.IsDir() && strings.HasPrefix(name, "memory")
+		if !isMemoryFile {
+			continue
+		}
+		target := filepath.Join(dir, name)
+		allocBefore, allocAfter, scanErr := punchZeroPages(target)
+		if scanErr != nil {
+			slog.Warn("punch: zero-page scan failed", "path", target, "error", scanErr)
+			continue
+		}
+		slog.Info("punch: zero-page scan done",
+			"path", target,
+			"alloc_before", allocBefore,
+			"alloc_after", allocAfter,
+			"reclaimed", allocBefore-allocAfter)
+	}
+}
+
+// punchZeroPages scans path block-by-block, batching runs of all-zero 4 KiB
+// blocks and punching them out via FALLOC_FL_PUNCH_HOLE. Existing holes are
+// skipped via SEEK_DATA so a partially-sparse input stays cheap to scan.
+//
+// A trailing block shorter than punchBlockSize is never punched, even when
+// it is all zeros.
+//
+// Returns the file's disk allocation (st_blocks * 512) before and after.
+// On any error both values are 0 and the file may have been partially
+// punched already.
+func punchZeroPages(path string) (int64, int64, error) {
+	f, err := os.OpenFile(path, os.O_RDWR, 0)
+	if err != nil {
+		return 0, 0, err
+	}
+	defer f.Close()
+
+	stBefore, err := statBlocks(f)
+	if err != nil {
+		return 0, 0, fmt.Errorf("stat before: %w", err)
+	}
+
+	fi, err := f.Stat()
+	if err != nil {
+		return 0, 0, fmt.Errorf("stat: %w", err)
+	}
+	size := fi.Size()
+
+	buf := make([]byte, punchReadSize)
+	off := int64(0)
+
+	for off < size {
+		// Skip ahead to next data region; nothing to do in holes. ENXIO
+		// means there is no data at or after off.
+		dataStart, err := f.Seek(off, unix.SEEK_DATA)
+		if err != nil {
+			if errors.Is(err, io.EOF) || errors.Is(err, unix.ENXIO) {
+				break
+			}
+			return 0, 0, fmt.Errorf("seek_data @ %d: %w", off, err)
+		}
+
+		// Find end of this data extent. The search starts at dataStart, the
+		// exact offset SEEK_DATA reported, not at the block-aligned scan
+		// start: if the aligned-down offset landed inside a hole, SEEK_HOLE
+		// would return it unchanged and the outer loop would never advance.
+		endData, err := f.Seek(dataStart, unix.SEEK_HOLE)
+		if err != nil {
+			return 0, 0, fmt.Errorf("seek_hole @ %d: %w", dataStart, err)
+		}
+		if endData <= off {
+			// Defensive: never loop without making forward progress.
+			break
+		}
+
+		// Scan [cur, endData) chunk by chunk, starting at the block that
+		// contains dataStart; batch zero runs across both intra-chunk and
+		// inter-chunk boundaries so a contiguous zero region is punched in
+		// a single fallocate.
+		zeroStart := int64(-1)
+		cur := dataStart &^ (punchBlockSize - 1) // align down to block
+		for cur < endData {
+			toRead := min(int64(len(buf)), endData-cur)
+			n, err := readAt(f, buf[:toRead], cur)
+			if err != nil {
+				return 0, 0, fmt.Errorf("read @ %d: %w", cur, err)
+			}
+			if n == 0 {
+				break
+			}
+			// Walk the chunk one block at a time, tracking zero runs.
+			for blkOff := 0; blkOff < n; blkOff += punchBlockSize {
+				blkEnd := min(blkOff+punchBlockSize, n)
+				blk := buf[blkOff:blkEnd]
+				blkAbs := cur + int64(blkOff)
+				if isZero(blk) && len(blk) == punchBlockSize {
+					if zeroStart < 0 {
+						zeroStart = blkAbs
+					}
+				} else if zeroStart >= 0 {
+					// A non-zero or short block ends the current zero run.
+					if err := punch(f, zeroStart, blkAbs-zeroStart); err != nil {
+						return 0, 0, err
+					}
+					zeroStart = -1
+				}
+			}
+			cur += int64(n)
+		}
+		// Flush a zero run that reached the end of the extent.
+		if zeroStart >= 0 {
+			if err := punch(f, zeroStart, cur-zeroStart); err != nil {
+				return 0, 0, err
+			}
+		}
+		off = endData
+	}
+
+	stAfter, err := statBlocks(f)
+	if err != nil {
+		return 0, 0, fmt.Errorf("stat after: %w", err)
+	}
+	return stBefore, stAfter, nil
+}
+
+func punch(f *os.File, off, length int64) error {
+	mode := uint32(unix.FALLOC_FL_PUNCH_HOLE | unix.FALLOC_FL_KEEP_SIZE)
+	if err := unix.Fallocate(int(f.Fd()), mode, off, length); err != nil {
+		return fmt.Errorf("fallocate punch @ %d len %d: %w", off, length, err)
+	}
+	return nil
+}
+
+// readAt reads into buf from f at offset off and returns the byte count.
+// An io.EOF from the underlying ReadAt is reported as success, so a short
+// read at end of file comes back as (n, nil); any other error is passed on.
+func readAt(f *os.File, buf []byte, off int64) (int, error) {
+	switch n, err := f.ReadAt(buf, off); err {
+	case nil, io.EOF:
+		return n, nil
+	default:
+		return n, err
+	}
+}
+
+// isZero reports whether every byte of b is zero. An empty or nil slice
+// counts as all-zero.
+func isZero(b []byte) bool {
+	for i := 0; i < len(b); i++ {
+		if b[i] != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func statBlocks(f *os.File) (int64, error) {
+	var st unix.Stat_t
+	if err := unix.Fstat(int(f.Fd()), &st); err != nil {
+		return 0, err
+	}
+	return int64(st.Blocks) * 512, nil
+}
--- a/internal/sandbox/restore.go
+++ b/internal/sandbox/restore.go
@ -0,0 +1,118 @@
+// Package sandbox: shared CH-restore helpers used by both Resume (paused →
+// running) and the snapshot-template launch path (template → fresh sandbox).
+//
+// The two callers diverge in how they acquire resources (slot, dm-snapshot,
+// sandbox identity) but converge on:
+//
+//	build VMConfig → CreateFromSnapshot → vm.Resume → wait envd → balloon deflate
+//
+// These steps are extracted here so the sequence — and its quirks (paused
+// post-restore state, balloon best-effort, restored disk path baked into
+// CH's config.json) — has a single source of truth.
+package sandbox
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"path/filepath"
+
+	"git.omukk.dev/wrenn/wrenn/internal/envdclient"
+	"git.omukk.dev/wrenn/wrenn/internal/network"
+	"git.omukk.dev/wrenn/wrenn/internal/vm"
+)
+
+// restoreInputs is the common set of fields needed to build a restore VMConfig.
+// buildRestoreVMConfig copies each field into the matching VMConfig field;
+// from slot it reads the namespace ID, tap name, tap MAC, guest IP, tap IP
+// (used as the gateway) and guest netmask.
+type restoreInputs struct {
+	sandboxID  string // VM identity for the new CH process (sock path, log file)
+	templateID string // forwarded to envd via PostInit (informational)
+	snapDir    string // directory containing CH snapshot artefacts
+	rootfsPath string // /dev/mapper/wrenn-{newID} — per-sandbox dm-snapshot
+	vcpus      int
+	memoryMB   int
+	slot       *network.Slot
+	sandboxDir string // override for VMConfig.SandboxDir; "" = default
+}
+
+// buildRestoreVMConfig assembles the VMConfig used to launch a CH process in
+// restore mode. sandboxDir, when non-empty, overrides the default
+// "/tmp/ch-vm-{SandboxID}" — required when the snapshot's saved config.json
+// points at a different sandbox's tmpfs path (i.e. snapshot-template launch).
+func (m *Manager) buildRestoreVMConfig(in restoreInputs) vm.VMConfig {
+	return vm.VMConfig{
+		SandboxID:         in.sandboxID,
+		TemplateID:        in.templateID,
+		KernelPath:        m.cfg.KernelPath,
+		RootfsPath:        in.rootfsPath,
+		VCPUs:             in.vcpus,
+		MemoryMB:          in.memoryMB,
+		NetworkNamespace:  in.slot.NamespaceID,
+		TapDevice:         in.slot.TapName,
+		TapMAC:            in.slot.TapMAC,
+		GuestIP:           in.slot.GuestIP,
+		GatewayIP:         in.slot.TapIP,
+		NetMask:           in.slot.GuestNetMask,
+		VMMBin:            m.cfg.VMMBin,
+		LogDir:            filepath.Join(m.cfg.WrennDir, "logs"),
+		RestoreFromDir:    in.snapDir,
+		RestoreLazyMemory: true,
+		SandboxDir:        in.sandboxDir,
+	}
+}
+
+// launchRestoredVM brings a restored VM up in four steps: create the CH
+// process from the snapshot, resume it, wait for envd to answer (bounded by
+// envdReadyTimeout for the configured memory size), and finally ask for the
+// balloon to be deflated to 0.
+//
+// If resume or the envd wait fails, the VM is destroyed before the error is
+// returned; dm/network/slot teardown stays with the caller. A failed
+// balloon update is only logged.
+//
+// Returns the connected envd client on success.
+func (m *Manager) launchRestoredVM(ctx context.Context, vmCfg vm.VMConfig, hostIP string) (*envdclient.Client, error) {
+	sandboxID := vmCfg.SandboxID
+
+	// destroyPartial removes the half-started VM. It uses a background
+	// context, not ctx, and its error is intentionally dropped.
+	destroyPartial := func() {
+		_ = m.vm.Destroy(context.Background(), sandboxID)
+	}
+
+	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg); err != nil {
+		return nil, fmt.Errorf("create from snapshot: %w", err)
+	}
+
+	if err := m.vm.Resume(ctx, sandboxID); err != nil {
+		destroyPartial()
+		return nil, fmt.Errorf("vm resume: %w", err)
+	}
+
+	envd := envdclient.New(hostIP)
+	readyCtx, cancelReady := context.WithTimeout(ctx, envdReadyTimeout(vmCfg.MemoryMB))
+	defer cancelReady()
+	if err := envd.WaitUntilReady(readyCtx); err != nil {
+		destroyPartial()
+		return nil, fmt.Errorf("wait envd: %w", err)
+	}
+
+	// Best-effort balloon deflate. Free-page reporting drains pages while the
+	// sandbox runs; the resumed guest needs its full memory budget back. A
+	// failure leaves the guest memory-starved but doesn't break correctness.
+	if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
+		slog.Warn("balloon deflate after restore failed", "id", sandboxID, "error", err)
+	}
+
+	return envd, nil
+}
+
+// initAndStartMemoryLoader performs envd's /init lifecycle bump and then
+// starts the background memory loader. The order is deliberate: /init resets
+// envd's mem_preload_* atomics, so the loader's POST /memory/preload must
+// land afterwards — otherwise the next CreateSnapshot/Pause would observe a
+// stale "idle" state and snapshot a memfile full of holes.
+//
+// A failed PostInit is logged and the loader is still started. If the envd
+// client has been cleared, a warning is logged and the loader is NOT started.
+//
+// Must be called with sb already registered in m.boxes with StatusRunning
+// and sb.client populated.
+func (m *Manager) initAndStartMemoryLoader(ctx context.Context, sb *sandboxState, defaultUser, templateIDStr string, envVars map[string]string) {
+	initCtx, cancelInit := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
+	defer cancelInit()
+
+	envd := sb.client.Load()
+	if envd == nil {
+		slog.Warn("post-restore PostInit skipped: envd client cleared", "id", sb.ID)
+		return
+	}
+
+	initErr := envd.PostInitWithDefaults(initCtx, defaultUser, envVars, sb.ID, templateIDStr)
+	if initErr != nil {
+		slog.Warn("post-restore PostInit failed", "id", sb.ID, "error", initErr)
+	}
+
+	m.startMemoryLoader(sb)
+}
--- a/internal/sandbox/restore_paused.go
+++ b/internal/sandbox/restore_paused.go
@ -0,0 +1,208 @@
+package sandbox
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/google/uuid"
+
+	"git.omukk.dev/wrenn/wrenn/internal/layout"
+	"git.omukk.dev/wrenn/wrenn/internal/models"
+)
+
+// RestorePausedSandboxes scans WRENN_DIR/sandboxes/ for paused-sandbox
+// snapshots left behind by a previous agent instance and re-registers them
+// in m.boxes as StatusPaused. Without this, ListSandboxes would not report
+// these sandboxes, and the CP's HostMonitor would mark them stopped via
+// the missing-confirmed-dead reconcile path — orphaning the on-disk
+// snapshot dir and surfacing a leaked "stopped" sandbox to users.
+//
+// Restored sandboxes hold ONLY the slot reservation; VM / network / dm /
+// loop refcount stay unowned until Resume rebuilds them. baseImagePath is
+// deliberately NOT set on the in-memory entry so cleanup() does not call
+// loops.Release on a loop that was never Acquire'd — the registry tolerates
+// a Release of an unknown key, but a coincident-same-base running sandbox
+// would have its refcount decremented incorrectly.
+//
+// Must be called once at agent startup, AFTER CleanupOrphanPauseDirs (so
+// .staging-* / .trash-* dirs are gone) and BEFORE the HTTP server starts
+// serving — otherwise an early Create RPC can race the slot reservation.
+//
+// Corrupt snapshot dirs (unparseable meta, missing slot index, bad team or
+// template UUID) are renamed to .trash-{ts}/ so a future
+// CleanupOrphanPauseDirs sweeps them. Soft errors are logged; this function
+// never returns an error — startup should not fail because a single sandbox
+// is unrecoverable. A sandboxes directory that exists but cannot be read is
+// logged as a warning and nothing is restored.
+func (m *Manager) RestorePausedSandboxes() {
+	sandboxesDir := layout.SandboxesDir(m.cfg.WrennDir)
+	entries, err := os.ReadDir(sandboxesDir)
+	if err != nil {
+		// A missing directory means a fresh install with nothing to restore.
+		// Any other failure (permissions, I/O) silently skipping restore
+		// would hide the reason paused sandboxes vanish, so surface it.
+		if !os.IsNotExist(err) {
+			slog.Warn("restore: cannot read sandboxes dir, skipping paused sandbox restore",
+				"dir", sandboxesDir, "error", err)
+		}
+		return
+	}
+
+	type candidate struct {
+		sandboxID string
+		snapDir   string
+		meta      *snapshotMeta
+		teamID    [16]byte
+		templID   [16]byte
+	}
+
+	// Pass 1: parse every snapshot meta. Trash anything unreadable or
+	// missing the slot index — those are crash artefacts, not recoverable
+	// sandboxes.
+	candidates := make([]candidate, 0, len(entries))
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		name := e.Name()
+		// Skip CleanupOrphanPauseDirs's territory. If it ran before us
+		// these are already gone; if not, leave them alone.
+		if strings.Contains(name, ".staging-") || strings.Contains(name, ".trash-") {
+			continue
+		}
+
+		snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, name)
+		meta, err := readSnapshotMeta(snapDir)
+		if err != nil {
+			slog.Warn("restore: unreadable snapshot meta, trashing dir",
+				"id", name, "error", err)
+			trashCorruptDir(snapDir)
+			continue
+		}
+		// A zero SlotIndex is treated as "not recorded".
+		if meta.SlotIndex == 0 {
+			slog.Warn("restore: snapshot has no slot_index, trashing dir", "id", name)
+			trashCorruptDir(snapDir)
+			continue
+		}
+		teamBytes, err := parsePlainUUID(meta.TeamID)
+		if err != nil {
+			slog.Warn("restore: bad team_id in snapshot meta", "id", name, "error", err)
+			trashCorruptDir(snapDir)
+			continue
+		}
+		templateBytes, err := parsePlainUUID(meta.TemplateID)
+		if err != nil {
+			slog.Warn("restore: bad template_id in snapshot meta", "id", name, "error", err)
+			trashCorruptDir(snapDir)
+			continue
+		}
+		candidates = append(candidates, candidate{
+			sandboxID: name,
+			snapDir:   snapDir,
+			meta:      meta,
+			teamID:    teamBytes,
+			templID:   templateBytes,
+		})
+	}
+
+	// Pass 2: bucket by slot index, pick the newest CreatedAt per slot.
+	// Multiple candidates per slot happen when older paused-sandbox dirs
+	// were left on disk by the pre-fix leak (DB row marked stopped but the
+	// snapshot was never cleaned). The newest is the most likely live one;
+	// older losers are trashed so CleanupOrphanPauseDirs sweeps them on
+	// the next startup.
+	bySlot := make(map[int][]candidate, len(candidates))
+	for _, c := range candidates {
+		bySlot[c.meta.SlotIndex] = append(bySlot[c.meta.SlotIndex], c)
+	}
+
+	restored := 0
+	pruned := 0
+	for slot, cands := range bySlot {
+		// Newest first, so cands[0] is the winner for this slot.
+		sort.Slice(cands, func(i, j int) bool {
+			return cands[i].meta.CreatedAt.After(cands[j].meta.CreatedAt)
+		})
+
+		// Trash every loser. The host_monitor's zombie-cleanup path catches
+		// the winner if its DB row says 'stopped' — but losers never enter
+		// m.boxes and would otherwise sit on disk indefinitely.
+		for _, stale := range cands[1:] {
+			slog.Info("restore: pruning older snapshot for same slot",
+				"id", stale.sandboxID, "slot", slot, "created", stale.meta.CreatedAt,
+				"winner", cands[0].sandboxID, "winner_created", cands[0].meta.CreatedAt)
+			trashCorruptDir(stale.snapDir)
+			pruned++
+		}
+
+		winner := cands[0]
+		if err := m.slots.Reserve(winner.meta.SlotIndex); err != nil {
+			// Reserve only fails if another candidate (different slot value
+			// in meta but same numeric index) already grabbed it, or if the
+			// allocator is corrupt. Either way the snapshot is unusable
+			// without a slot, so trash it.
+			slog.Warn("restore: slot reservation failed, trashing dir",
+				"id", winner.sandboxID, "slot", winner.meta.SlotIndex, "error", err)
+			trashCorruptDir(winner.snapDir)
+			pruned++
+			continue
+		}
+
+		sb := &sandboxState{
+			Sandbox: models.Sandbox{
+				ID:             winner.sandboxID,
+				Status:         models.StatusPaused,
+				TemplateTeamID: winner.teamID,
+				TemplateID:     winner.templID,
+				VCPUs:          winner.meta.VCPUs,
+				MemoryMB:       winner.meta.MemoryMB,
+				TimeoutSec:     winner.meta.TimeoutSec,
+				SlotIndex:      winner.meta.SlotIndex,
+				CreatedAt:      winner.meta.CreatedAt,
+				// LastActiveAt cosmetic only — TTL reaper ignores non-Running.
+				LastActiveAt: winner.meta.CreatedAt,
+			},
+			// connTracker must be non-nil: resumeFromMeta calls Reset() on it
+			// unconditionally during rehydration. A nil pointer would panic.
+			connTracker: &ConnTracker{},
+			// baseImagePath intentionally left empty — see function doc.
+			// sandboxDirOverride intentionally left empty — resumeFromMeta
+			// reads meta.SandboxDir from disk on the resume path.
+		}
+
+		m.mu.Lock()
+		m.boxes[winner.sandboxID] = sb
+		m.mu.Unlock()
+		restored++
+
+		slog.Info("restored paused sandbox", "id", winner.sandboxID,
+			"slot", winner.meta.SlotIndex, "vcpus", winner.meta.VCPUs, "memory_mb", winner.meta.MemoryMB)
+	}
+
+	if restored > 0 || pruned > 0 {
+		slog.Info("paused sandbox restore complete", "restored", restored, "pruned", pruned)
+	}
+}
+
+// parsePlainUUID turns a standard hyphenated UUID string (as produced by
+// id.UUIDString) back into the 16-byte representation used by sandboxState.
+func parsePlainUUID(s string) ([16]byte, error) {
+	if s == "" {
+		return [16]byte{}, fmt.Errorf("empty uuid string")
+	}
+	u, err := uuid.Parse(s)
+	if err != nil {
+		return [16]byte{}, err
+	}
+	return [16]byte(u), nil
+}
+
+// trashCorruptDir moves a corrupt snapshot directory out of the way by
+// renaming it to "{name}.trash-{unix-nanos}" next to the original, so a
+// future CleanupOrphanPauseDirs sweeps it. Best-effort: a failed rename is
+// logged and otherwise ignored — leaving the directory in place is safe
+// (restore will skip it again next startup) but unwanted.
+func trashCorruptDir(dir string) {
+	trashName := fmt.Sprintf("%s.trash-%d", filepath.Base(dir), time.Now().UnixNano())
+	dst := filepath.Join(filepath.Dir(dir), trashName)
+	renameErr := os.Rename(dir, dst)
+	if renameErr == nil {
+		return
+	}
+	slog.Warn("restore: failed to trash corrupt snapshot dir",
+		"src", dir, "dst", dst, "error", renameErr)
+}