Add device-mapper snapshots, test UI, fix pause ordering and lint errors

- Replace reflink rootfs copy with device-mapper snapshots (shared read-only loop device per base template, per-sandbox sparse CoW file) - Add devicemapper package with create/restore/remove/flatten operations and refcounted LoopRegistry for base image loop devices - Fix pause ordering: destroy VM before removing dm-snapshot to avoid "device busy" error (FC must release the dm device first) - Add test UI at GET /test for sandbox lifecycle management (create, pause, resume, destroy, exec, snapshot create/list/delete) - Fix DirSize to report actual disk usage (stat.Blocks * 512) instead of apparent size, so sparse CoW files report correctly - Add timing logs to pause flow for performance diagnostics - Fix all lint errors across api, network, vm, uffd, and sandbox packages - Remove obsolete internal/filesystem package (replaced by devicemapper) - Update CLAUDE.md with device-mapper architecture documentation
2026-03-13 08:25:40 +06:00
parent 778894b488
commit 63e9132d38
23 changed files with 1202 additions and 155 deletions
--- a/internal/api/handlers_exec.go
+++ b/internal/api/handlers_exec.go
@ -3,6 +3,7 @@ package api
 import (
 	"encoding/base64"
 	"encoding/json"
+	"log/slog"
 	"net/http"
 	"time"
 	"unicode/utf8"
@ -85,13 +86,15 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
 	duration := time.Since(start)

 	// Update last active.
-	h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
+	if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
 		ID: sandboxID,
 		LastActiveAt: pgtype.Timestamptz{
 			Time:  time.Now(),
 			Valid: true,
 		},
-	})
+	}); err != nil {
+		slog.Warn("failed to update last_active_at", "id", sandboxID, "error", err)
+	}

 	// Use base64 encoding if output contains non-UTF-8 bytes.
 	stdout := resp.Msg.Stdout
--- a/internal/api/handlers_exec_stream.go
+++ b/internal/api/handlers_exec_stream.go
@ -37,11 +37,6 @@ type wsStartMsg struct {
 	Args []string `json:"args"`
 }

-// wsStopMsg is sent by the client to stop the process.
-type wsStopMsg struct {
-	Type string `json:"type"` // "stop"
-}
-
 // wsOutMsg is sent by the server for process events.
 type wsOutMsg struct {
 	Type     string `json:"type"`                // "start", "stdout", "stderr", "exit", "error"
--- a/internal/api/handlers_files.go
+++ b/internal/api/handlers_files.go
@ -128,5 +128,5 @@ func (h *filesHandler) Download(w http.ResponseWriter, r *http.Request) {
 	}

 	w.Header().Set("Content-Type", "application/octet-stream")
-	w.Write(resp.Msg.Content)
+	_, _ = w.Write(resp.Msg.Content)
 }
--- a/internal/api/handlers_files_stream.go
+++ b/internal/api/handlers_files_stream.go
@ -2,6 +2,7 @@ package api

 import (
 	"io"
+	"log/slog"
 	"mime"
 	"mime/multipart"
 	"net/http"
@ -189,6 +190,6 @@ func (h *filesStreamHandler) StreamDownload(w http.ResponseWriter, r *http.Reque

 	if err := stream.Err(); err != nil {
 		// Headers already sent, nothing we can do but log.
-		// The client will see a truncated response.
+		slog.Warn("file stream error after headers sent", "error", err)
 	}
 }
--- a/internal/api/handlers_sandbox.go
+++ b/internal/api/handlers_sandbox.go
@ -111,7 +111,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
 	sandboxID := id.NewSandboxID()

 	// Insert pending record.
-	sb, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{
+	_, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{
 		ID:         sandboxID,
 		OwnerID:    "",
 		HostID:     "default",
@ -136,9 +136,11 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
 		TimeoutSec: req.TimeoutSec,
 	}))
 	if err != nil {
-		h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
+		if _, dbErr := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
 			ID: sandboxID, Status: "error",
-		})
+		}); dbErr != nil {
+			slog.Warn("failed to update sandbox status to error", "id", sandboxID, "error", dbErr)
+		}
 		status, code, msg := agentErrToHTTP(err)
 		writeError(w, status, code, msg)
 		return
@ -146,7 +148,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {

 	// Update to running.
 	now := time.Now()
-	sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
+	sb, err := h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
 		ID:      sandboxID,
 		HostIp:  resp.Msg.HostIp,
 		GuestIp: "",
--- a/internal/api/handlers_snapshots.go
+++ b/internal/api/handlers_snapshots.go
@ -84,7 +84,9 @@ func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
 			return
 		}
 		// Delete existing template record and files.
-		h.db.DeleteTemplate(ctx, req.Name)
+		if err := h.db.DeleteTemplate(ctx, req.Name); err != nil {
+			slog.Warn("failed to delete existing template", "name", req.Name, "error", err)
+		}
 	}

 	// Verify sandbox exists and is running or paused.
--- a/internal/api/handlers_test_ui.go
+++ b/internal/api/handlers_test_ui.go
@ -0,0 +1,435 @@
+package api
+
+import (
+	"fmt"
+	"net/http"
+)
+
+func serveTestUI(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "text/html; charset=utf-8")
+	fmt.Fprint(w, testUIHTML)
+}
+
+const testUIHTML = `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Wrenn Sandbox — Test Console</title>
+<style>
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body {
+    font-family: 'Menlo', 'Consolas', 'JetBrains Mono', monospace;
+    font-size: 13px;
+    background: #0f1211;
+    color: #c8c4bc;
+    padding: 16px;
+  }
+  h1 { font-size: 18px; color: #e8e5df; margin-bottom: 12px; }
+  h2 { font-size: 14px; color: #89a785; margin: 16px 0 8px; border-bottom: 1px solid #262c2a; padding-bottom: 4px; }
+  .grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+  .panel {
+    background: #151918;
+    border: 1px solid #262c2a;
+    border-radius: 8px;
+    padding: 12px;
+  }
+  .full { grid-column: 1 / -1; }
+  label { display: block; color: #8a867f; margin: 6px 0 2px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; }
+  input, select {
+    width: 100%;
+    background: #1b201e;
+    border: 1px solid #262c2a;
+    color: #e8e5df;
+    padding: 6px 8px;
+    border-radius: 4px;
+    font-family: inherit;
+    font-size: 13px;
+  }
+  input:focus, select:focus { outline: none; border-color: #5e8c58; }
+  .btn-row { display: flex; gap: 6px; margin-top: 8px; flex-wrap: wrap; }
+  button {
+    padding: 6px 14px;
+    border: 1px solid #262c2a;
+    border-radius: 4px;
+    font-family: inherit;
+    font-size: 12px;
+    cursor: pointer;
+    background: #1b201e;
+    color: #c8c4bc;
+    transition: all 0.15s;
+  }
+  button:hover { border-color: #5e8c58; color: #e8e5df; }
+  .btn-green { background: #2a3d28; border-color: #5e8c58; color: #89a785; }
+  .btn-green:hover { background: #3a5035; }
+  .btn-red { background: #3d2828; border-color: #b35544; color: #c27b6d; }
+  .btn-red:hover { background: #4d3030; }
+  .btn-amber { background: #3d3428; border-color: #9e7c2e; color: #c8a84e; }
+  .btn-amber:hover { background: #4d4030; }
+  .btn-blue { background: #28343d; border-color: #3d7aac; color: #6da0cc; }
+  .btn-blue:hover { background: #304050; }
+  table { width: 100%; border-collapse: collapse; margin-top: 8px; }
+  th { text-align: left; font-size: 11px; color: #8a867f; text-transform: uppercase; letter-spacing: 0.05em; padding: 4px 8px; border-bottom: 1px solid #262c2a; }
+  td { padding: 6px 8px; border-bottom: 1px solid #1b201e; }
+  tr:hover td { background: #1b201e; }
+  .status { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; }
+  .status-running { background: rgba(94,140,88,0.15); color: #89a785; }
+  .status-paused { background: rgba(158,124,46,0.15); color: #c8a84e; }
+  .status-pending { background: rgba(61,122,172,0.15); color: #6da0cc; }
+  .status-stopped { background: rgba(138,134,127,0.15); color: #8a867f; }
+  .status-error { background: rgba(179,85,68,0.15); color: #c27b6d; }
+  .status-hibernated { background: rgba(61,122,172,0.15); color: #6da0cc; }
+  .log {
+    background: #0f1211;
+    border: 1px solid #262c2a;
+    border-radius: 4px;
+    padding: 8px;
+    max-height: 300px;
+    overflow-y: auto;
+    margin-top: 8px;
+    font-size: 12px;
+    white-space: pre-wrap;
+    word-break: break-all;
+  }
+  .log-entry { margin-bottom: 4px; }
+  .log-time { color: #5f5c57; }
+  .log-ok { color: #89a785; }
+  .log-err { color: #c27b6d; }
+  .log-info { color: #6da0cc; }
+  .exec-output {
+    background: #0f1211;
+    border: 1px solid #262c2a;
+    border-radius: 4px;
+    padding: 8px;
+    max-height: 200px;
+    overflow-y: auto;
+    margin-top: 8px;
+    font-size: 12px;
+    white-space: pre-wrap;
+  }
+  .clickable { cursor: pointer; color: #89a785; text-decoration: underline; }
+  .clickable:hover { color: #aacdaa; }
+</style>
+</head>
+<body>
+
+<h1>Wrenn Sandbox Test Console</h1>
+
+<div class="grid">
+  <!-- Create Sandbox -->
+  <div class="panel">
+    <h2>Create Sandbox</h2>
+    <label>Template</label>
+    <input type="text" id="create-template" value="minimal" placeholder="minimal or snapshot name">
+    <label>vCPUs</label>
+    <input type="number" id="create-vcpus" value="1" min="1" max="8">
+    <label>Memory (MB)</label>
+    <input type="number" id="create-memory" value="512" min="128" max="8192">
+    <label>Timeout (sec)</label>
+    <input type="number" id="create-timeout" value="300" min="30">
+    <div class="btn-row">
+      <button class="btn-green" onclick="createSandbox()">Create</button>
+    </div>
+  </div>
+
+  <!-- Snapshot Management -->
+  <div class="panel">
+    <h2>Create Snapshot</h2>
+    <label>Sandbox ID</label>
+    <input type="text" id="snap-sandbox-id" placeholder="sb-xxxxxxxx">
+    <label>Snapshot Name (optional)</label>
+    <input type="text" id="snap-name" placeholder="auto-generated if empty">
+    <div class="btn-row">
+      <button class="btn-amber" onclick="createSnapshot()">Create Snapshot</button>
+      <label style="display:inline-flex;align-items:center;margin:0;font-size:12px;text-transform:none;letter-spacing:0">
+        <input type="checkbox" id="snap-overwrite" style="width:auto;margin-right:4px"> Overwrite
+      </label>
+    </div>
+
+    <h2>Snapshots / Templates</h2>
+    <div class="btn-row">
+      <button onclick="listSnapshots()">Refresh</button>
+    </div>
+    <div id="snapshots-table"></div>
+  </div>
+
+  <!-- Execute Command -->
+  <div class="panel">
+    <h2>Execute Command</h2>
+    <label>Sandbox ID</label>
+    <input type="text" id="exec-sandbox-id" placeholder="sb-xxxxxxxx">
+    <label>Command</label>
+    <input type="text" id="exec-cmd" value="/bin/sh" placeholder="/bin/sh">
+    <label>Args (comma separated)</label>
+    <input type="text" id="exec-args" value="-c,uname -a" placeholder="-c,echo hello">
+    <div class="btn-row">
+      <button class="btn-green" onclick="execCmd()">Run</button>
+    </div>
+    <div id="exec-output" class="exec-output" style="display:none"></div>
+  </div>
+
+  <!-- Activity Log -->
+  <div class="panel">
+    <h2>Activity Log</h2>
+    <div id="log" class="log"></div>
+  </div>
+
+  <!-- Sandboxes List -->
+  <div class="panel full">
+    <h2>Sandboxes</h2>
+    <div class="btn-row">
+      <button onclick="listSandboxes()">Refresh</button>
+      <label style="display:inline-flex;align-items:center;margin:0;font-size:12px;text-transform:none;letter-spacing:0">
+        <input type="checkbox" id="auto-refresh" style="width:auto;margin-right:4px"> Auto-refresh (5s)
+      </label>
+    </div>
+    <div id="sandboxes-table"></div>
+  </div>
+</div>
+
+<script>
+const API = '';
+
+function log(msg, level) {
+  const el = document.getElementById('log');
+  const t = new Date().toLocaleTimeString();
+  const cls = level === 'ok' ? 'log-ok' : level === 'err' ? 'log-err' : 'log-info';
+  el.innerHTML = '<div class="log-entry"><span class="log-time">' + t + '</span> <span class="' + cls + '">' + esc(msg) + '</span></div>' + el.innerHTML;
+}
+
+function esc(s) {
+  const d = document.createElement('div');
+  d.textContent = s;
+  return d.innerHTML;
+}
+
+async function api(method, path, body) {
+  const opts = { method, headers: {} };
+  if (body) {
+    opts.headers['Content-Type'] = 'application/json';
+    opts.body = JSON.stringify(body);
+  }
+  const resp = await fetch(API + path, opts);
+  if (resp.status === 204) return null;
+  const data = await resp.json();
+  if (resp.status >= 300) {
+    throw new Error(data.error ? data.error.message : resp.statusText);
+  }
+  return data;
+}
+
+function statusBadge(s) {
+  return '<span class="status status-' + s + '">' + s + '</span>';
+}
+
+// --- Sandboxes ---
+
+async function listSandboxes() {
+  try {
+    const data = await api('GET', '/v1/sandboxes');
+    renderSandboxes(data);
+  } catch (e) {
+    log('List sandboxes failed: ' + e.message, 'err');
+  }
+}
+
+function renderSandboxes(sandboxes) {
+  if (!sandboxes || sandboxes.length === 0) {
+    document.getElementById('sandboxes-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No sandboxes</p>';
+    return;
+  }
+  let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
+  for (const sb of sandboxes) {
+    html += '<tr>';
+    html += '<td class="clickable" onclick="useSandbox(\'' + sb.id + '\')">' + sb.id + '</td>';
+    html += '<td>' + statusBadge(sb.status) + '</td>';
+    html += '<td>' + esc(sb.template) + '</td>';
+    html += '<td>' + sb.vcpus + '</td>';
+    html += '<td>' + sb.memory_mb + 'MB</td>';
+    html += '<td>' + (sb.host_ip || '-') + '</td>';
+    html += '<td>' + new Date(sb.created_at).toLocaleTimeString() + '</td>';
+    html += '<td><div class="btn-row">';
+    if (sb.status === 'running') {
+      html += '<button class="btn-amber" onclick="pauseSandbox(\'' + sb.id + '\')">Pause</button>';
+      html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
+    } else if (sb.status === 'paused') {
+      html += '<button class="btn-green" onclick="resumeSandbox(\'' + sb.id + '\')">Resume</button>';
+      html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
+    } else {
+      html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
+    }
+    html += '</div></td>';
+    html += '</tr>';
+  }
+  html += '</tbody></table>';
+  document.getElementById('sandboxes-table').innerHTML = html;
+}
+
+function useSandbox(id) {
+  document.getElementById('exec-sandbox-id').value = id;
+  document.getElementById('snap-sandbox-id').value = id;
+}
+
+async function createSandbox() {
+  const template = document.getElementById('create-template').value;
+  const vcpus = parseInt(document.getElementById('create-vcpus').value);
+  const memory_mb = parseInt(document.getElementById('create-memory').value);
+  const timeout_sec = parseInt(document.getElementById('create-timeout').value);
+  log('Creating sandbox (template=' + template + ', vcpus=' + vcpus + ', mem=' + memory_mb + 'MB)...', 'info');
+  try {
+    const data = await api('POST', '/v1/sandboxes', { template, vcpus, memory_mb, timeout_sec });
+    log('Created sandbox ' + data.id + ' [' + data.status + ']', 'ok');
+    listSandboxes();
+  } catch (e) {
+    log('Create failed: ' + e.message, 'err');
+  }
+}
+
+async function pauseSandbox(id) {
+  log('Pausing ' + id + '...', 'info');
+  try {
+    await api('POST', '/v1/sandboxes/' + id + '/pause');
+    log('Paused ' + id, 'ok');
+    listSandboxes();
+  } catch (e) {
+    log('Pause failed: ' + e.message, 'err');
+  }
+}
+
+async function resumeSandbox(id) {
+  log('Resuming ' + id + '...', 'info');
+  try {
+    await api('POST', '/v1/sandboxes/' + id + '/resume');
+    log('Resumed ' + id, 'ok');
+    listSandboxes();
+  } catch (e) {
+    log('Resume failed: ' + e.message, 'err');
+  }
+}
+
+async function destroySandbox(id) {
+  log('Destroying ' + id + '...', 'info');
+  try {
+    await api('DELETE', '/v1/sandboxes/' + id);
+    log('Destroyed ' + id, 'ok');
+    listSandboxes();
+  } catch (e) {
+    log('Destroy failed: ' + e.message, 'err');
+  }
+}
+
+// --- Exec ---
+
+async function execCmd() {
+  const sandboxId = document.getElementById('exec-sandbox-id').value;
+  const cmd = document.getElementById('exec-cmd').value;
+  const argsStr = document.getElementById('exec-args').value;
+  const args = argsStr ? argsStr.split(',').map(s => s.trim()) : [];
+
+  if (!sandboxId) { log('No sandbox ID for exec', 'err'); return; }
+
+  const out = document.getElementById('exec-output');
+  out.style.display = 'block';
+  out.textContent = 'Running...';
+
+  log('Exec on ' + sandboxId + ': ' + cmd + ' ' + args.join(' '), 'info');
+  try {
+    const data = await api('POST', '/v1/sandboxes/' + sandboxId + '/exec', { cmd, args });
+    let text = '';
+    if (data.stdout) text += data.stdout;
+    if (data.stderr) text += '\n[stderr]\n' + data.stderr;
+    text += '\n[exit_code=' + data.exit_code + ', duration=' + data.duration_ms + 'ms]';
+    out.textContent = text;
+    log('Exec completed (exit=' + data.exit_code + ')', data.exit_code === 0 ? 'ok' : 'err');
+  } catch (e) {
+    out.textContent = 'Error: ' + e.message;
+    log('Exec failed: ' + e.message, 'err');
+  }
+}
+
+// --- Snapshots ---
+
+async function createSnapshot() {
+  const sandbox_id = document.getElementById('snap-sandbox-id').value;
+  const name = document.getElementById('snap-name').value;
+  const overwrite = document.getElementById('snap-overwrite').checked;
+
+  if (!sandbox_id) { log('No sandbox ID for snapshot', 'err'); return; }
+
+  const body = { sandbox_id };
+  if (name) body.name = name;
+
+  const qs = overwrite ? '?overwrite=true' : '';
+  log('Creating snapshot from ' + sandbox_id + (name ? ' as "' + name + '"' : '') + '...', 'info');
+  try {
+    const data = await api('POST', '/v1/snapshots' + qs, body);
+    log('Snapshot created: ' + data.name + ' (' + (data.size_bytes / 1024 / 1024).toFixed(1) + 'MB)', 'ok');
+    listSnapshots();
+    listSandboxes();
+  } catch (e) {
+    log('Snapshot failed: ' + e.message, 'err');
+  }
+}
+
+async function listSnapshots() {
+  try {
+    const data = await api('GET', '/v1/snapshots');
+    renderSnapshots(data);
+  } catch (e) {
+    log('List snapshots failed: ' + e.message, 'err');
+  }
+}
+
+function renderSnapshots(snapshots) {
+  if (!snapshots || snapshots.length === 0) {
+    document.getElementById('snapshots-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No snapshots</p>';
+    return;
+  }
+  let html = '<table><thead><tr><th>Name</th><th>Type</th><th>vCPUs</th><th>Mem</th><th>Size</th><th>Actions</th></tr></thead><tbody>';
+  for (const s of snapshots) {
+    html += '<tr>';
+    html += '<td class="clickable" onclick="useTemplate(\'' + esc(s.name) + '\')">' + esc(s.name) + '</td>';
+    html += '<td>' + s.type + '</td>';
+    html += '<td>' + (s.vcpus || '-') + '</td>';
+    html += '<td>' + (s.memory_mb ? s.memory_mb + 'MB' : '-') + '</td>';
+    html += '<td>' + (s.size_bytes / 1024 / 1024).toFixed(1) + 'MB</td>';
+    html += '<td><button class="btn-red" onclick="deleteSnapshot(\'' + esc(s.name) + '\')">Delete</button></td>';
+    html += '</tr>';
+  }
+  html += '</tbody></table>';
+  document.getElementById('snapshots-table').innerHTML = html;
+}
+
+function useTemplate(name) {
+  document.getElementById('create-template').value = name;
+  log('Template set to "' + name + '" — click Create to launch from this snapshot', 'info');
+}
+
+async function deleteSnapshot(name) {
+  log('Deleting snapshot "' + name + '"...', 'info');
+  try {
+    await api('DELETE', '/v1/snapshots/' + encodeURIComponent(name));
+    log('Deleted snapshot "' + name + '"', 'ok');
+    listSnapshots();
+  } catch (e) {
+    log('Delete snapshot failed: ' + e.message, 'err');
+  }
+}
+
+// --- Auto-refresh ---
+let refreshInterval = null;
+document.getElementById('auto-refresh').addEventListener('change', function() {
+  if (this.checked) {
+    refreshInterval = setInterval(listSandboxes, 5000);
+  } else {
+    clearInterval(refreshInterval);
+    refreshInterval = null;
+  }
+});
+
+// --- Init ---
+listSandboxes();
+listSnapshots();
+</script>
+</body>
+</html>`
--- a/internal/api/middleware.go
+++ b/internal/api/middleware.go
@ -24,7 +24,7 @@ type errorDetail struct {
 func writeJSON(w http.ResponseWriter, status int, v any) {
 	w.Header().Set("Content-Type", "application/json")
 	w.WriteHeader(status)
-	json.NewEncoder(w).Encode(v)
+	_ = json.NewEncoder(w).Encode(v)
 }

 func writeError(w http.ResponseWriter, status int, code, message string) {
--- a/internal/api/server.go
+++ b/internal/api/server.go
@ -35,6 +35,9 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
 	r.Get("/openapi.yaml", serveOpenAPI)
 	r.Get("/docs", serveDocs)

+	// Test UI for sandbox lifecycle management.
+	r.Get("/test", serveTestUI)
+
 	// Sandbox CRUD.
 	r.Route("/v1/sandboxes", func(r chi.Router) {
 		r.Post("/", sandbox.Create)
@ -71,7 +74,7 @@ func (s *Server) Handler() http.Handler {

 func serveOpenAPI(w http.ResponseWriter, r *http.Request) {
 	w.Header().Set("Content-Type", "application/yaml")
-	w.Write(openapiYAML)
+	_, _ = w.Write(openapiYAML)
 }

 func serveDocs(w http.ResponseWriter, r *http.Request) {
--- a/internal/devicemapper/devicemapper.go
+++ b/internal/devicemapper/devicemapper.go
@ -0,0 +1,317 @@
+// Package devicemapper provides device-mapper snapshot operations for
+// copy-on-write rootfs management. Each sandbox gets a dm-snapshot backed
+// by a shared read-only loop device (the base template image) and a
+// per-sandbox sparse CoW file that stores only modified blocks.
+package devicemapper
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"os/exec"
+	"strconv"
+	"strings"
+	"sync"
+)
+
+const (
+	// ChunkSize is the dm-snapshot chunk size in 512-byte sectors.
+	// 8 sectors = 4KB, matching the standard page/block size.
+	ChunkSize = 8
+)
+
+// loopEntry tracks a loop device and its reference count.
+type loopEntry struct {
+	device   string // e.g., /dev/loop0
+	refcount int
+}
+
+// LoopRegistry manages loop devices for base template images.
+// Each unique image path gets one read-only loop device, shared
+// across all sandboxes using that template. Reference counting
+// ensures the loop device is released when no sandboxes use it.
+type LoopRegistry struct {
+	mu      sync.Mutex
+	entries map[string]*loopEntry // imagePath → loopEntry
+}
+
+// NewLoopRegistry creates a new loop device registry.
+func NewLoopRegistry() *LoopRegistry {
+	return &LoopRegistry{
+		entries: make(map[string]*loopEntry),
+	}
+}
+
+// Acquire returns a read-only loop device for the given image path.
+// If one already exists, its refcount is incremented. Otherwise a new
+// loop device is created via losetup.
+func (r *LoopRegistry) Acquire(imagePath string) (string, error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if e, ok := r.entries[imagePath]; ok {
+		e.refcount++
+		slog.Debug("loop device reused", "image", imagePath, "device", e.device, "refcount", e.refcount)
+		return e.device, nil
+	}
+
+	dev, err := losetupCreate(imagePath)
+	if err != nil {
+		return "", fmt.Errorf("losetup %s: %w", imagePath, err)
+	}
+
+	r.entries[imagePath] = &loopEntry{device: dev, refcount: 1}
+	slog.Info("loop device created", "image", imagePath, "device", dev)
+	return dev, nil
+}
+
+// Release decrements the refcount for the given image path.
+// When the refcount reaches zero, the loop device is detached.
+func (r *LoopRegistry) Release(imagePath string) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	e, ok := r.entries[imagePath]
+	if !ok {
+		return
+	}
+
+	e.refcount--
+	if e.refcount <= 0 {
+		if err := losetupDetach(e.device); err != nil {
+			slog.Warn("losetup detach failed", "device", e.device, "error", err)
+		}
+		delete(r.entries, imagePath)
+		slog.Info("loop device released", "image", imagePath, "device", e.device)
+	}
+}
+
+// ReleaseAll detaches all loop devices. Used during shutdown.
+func (r *LoopRegistry) ReleaseAll() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	for path, e := range r.entries {
+		if err := losetupDetach(e.device); err != nil {
+			slog.Warn("losetup detach failed", "device", e.device, "error", err)
+		}
+		delete(r.entries, path)
+	}
+}
+
+// SnapshotDevice holds the state for a single dm-snapshot device.
+type SnapshotDevice struct {
+	Name       string // dm device name, e.g., "wrenn-sb-a1b2c3d4"
+	DevicePath string // /dev/mapper/<Name>
+	CowPath    string // path to the sparse CoW file
+	CowLoopDev string // loop device for the CoW file
+}
+
+// CreateSnapshot sets up a new dm-snapshot device.
+//
+// It creates a sparse CoW file, attaches it as a loop device, and creates
+// a device-mapper snapshot target combining the read-only origin with the
+// writable CoW layer.
+//
+// The origin loop device must already exist (from LoopRegistry.Acquire).
+func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
+	// Create sparse CoW file sized to match the origin.
+	if err := createSparseFile(cowPath, originSizeBytes); err != nil {
+		return nil, fmt.Errorf("create cow file: %w", err)
+	}
+
+	cowLoopDev, err := losetupCreateRW(cowPath)
+	if err != nil {
+		os.Remove(cowPath)
+		return nil, fmt.Errorf("losetup cow: %w", err)
+	}
+
+	sectors := originSizeBytes / 512
+	if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
+		if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
+			slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
+		}
+		os.Remove(cowPath)
+		return nil, fmt.Errorf("dmsetup create: %w", err)
+	}
+
+	devPath := "/dev/mapper/" + name
+
+	slog.Info("dm-snapshot created",
+		"name", name,
+		"device", devPath,
+		"origin", originLoopDev,
+		"cow", cowPath,
+	)
+
+	return &SnapshotDevice{
+		Name:       name,
+		DevicePath: devPath,
+		CowPath:    cowPath,
+		CowLoopDev: cowLoopDev,
+	}, nil
+}
+
+// RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file.
+// The CoW file must have been created with the persistent (P) flag and still
+// contain valid dm-snapshot metadata.
+func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
+	cowLoopDev, err := losetupCreateRW(cowPath)
+	if err != nil {
+		return nil, fmt.Errorf("losetup cow: %w", err)
+	}
+
+	sectors := originSizeBytes / 512
+	if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
+		if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
+			slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
+		}
+		return nil, fmt.Errorf("dmsetup create: %w", err)
+	}
+
+	devPath := "/dev/mapper/" + name
+
+	slog.Info("dm-snapshot restored",
+		"name", name,
+		"device", devPath,
+		"origin", originLoopDev,
+		"cow", cowPath,
+	)
+
+	return &SnapshotDevice{
+		Name:       name,
+		DevicePath: devPath,
+		CowPath:    cowPath,
+		CowLoopDev: cowLoopDev,
+	}, nil
+}
+
+// RemoveSnapshot tears down a dm-snapshot device and its CoW loop device.
+// The CoW file is NOT deleted — the caller decides whether to keep or remove it.
+func RemoveSnapshot(dev *SnapshotDevice) error {
+	if err := dmsetupRemove(dev.Name); err != nil {
+		return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err)
+	}
+
+	if err := losetupDetach(dev.CowLoopDev); err != nil {
+		slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err)
+	}
+
+	slog.Info("dm-snapshot removed", "name", dev.Name)
+	return nil
+}
+
+// FlattenSnapshot reads the full contents of a dm-snapshot device and writes
+// it to a new file. This merges the base image + CoW changes into a standalone
+// rootfs image suitable for use as a new template.
+func FlattenSnapshot(dmDevPath, outputPath string) error {
+	cmd := exec.Command("dd",
+		"if="+dmDevPath,
+		"of="+outputPath,
+		"bs=4M",
+		"status=none",
+	)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		os.Remove(outputPath)
+		return fmt.Errorf("dd flatten: %s: %w", string(out), err)
+	}
+	return nil
+}
+
+// OriginSizeBytes returns the size in bytes of a loop device's backing file.
+func OriginSizeBytes(loopDev string) (int64, error) {
+	// blockdev --getsize64 returns size in bytes.
+	out, err := exec.Command("blockdev", "--getsize64", loopDev).CombinedOutput()
+	if err != nil {
+		return 0, fmt.Errorf("blockdev --getsize64 %s: %s: %w", loopDev, strings.TrimSpace(string(out)), err)
+	}
+	s := strings.TrimSpace(string(out))
+	return strconv.ParseInt(s, 10, 64)
+}
+
+// CleanupStaleDevices removes any device-mapper devices matching the
+// "wrenn-" prefix that may have been left behind by a previous agent
+// instance that crashed or was killed. Should be called at agent startup.
+func CleanupStaleDevices() {
+	out, err := exec.Command("dmsetup", "ls", "--target", "snapshot").CombinedOutput()
+	if err != nil {
+		slog.Debug("dmsetup ls failed (may be normal if no devices exist)", "error", err)
+		return
+	}
+
+	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
+		if line == "" || line == "No devices found" {
+			continue
+		}
+		// dmsetup ls output format: "name\t(major:minor)"
+		name, _, _ := strings.Cut(line, "\t")
+		if !strings.HasPrefix(name, "wrenn-") {
+			continue
+		}
+
+		slog.Warn("removing stale dm-snapshot device", "name", name)
+		if err := dmsetupRemove(name); err != nil {
+			slog.Warn("failed to remove stale device", "name", name, "error", err)
+		}
+	}
+}
+
+// --- low-level helpers ---
+
+// losetupCreate attaches a file as a read-only loop device.
+func losetupCreate(imagePath string) (string, error) {
+	out, err := exec.Command("losetup", "--read-only", "--find", "--show", imagePath).Output()
+	if err != nil {
+		return "", fmt.Errorf("losetup --read-only: %w", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// losetupCreateRW attaches a file as a read-write loop device.
+func losetupCreateRW(path string) (string, error) {
+	out, err := exec.Command("losetup", "--find", "--show", path).Output()
+	if err != nil {
+		return "", fmt.Errorf("losetup: %w", err)
+	}
+	return strings.TrimSpace(string(out)), nil
+}
+
+// losetupDetach detaches a loop device.
+func losetupDetach(dev string) error {
+	return exec.Command("losetup", "-d", dev).Run()
+}
+
+// dmsetupCreate creates a dm-snapshot device with persistent metadata.
+func dmsetupCreate(name, originDev, cowDev string, sectors int64) error {
+	// Table format: <start> <size> snapshot <origin> <cow> P <chunk_size>
+	// P = persistent — CoW metadata survives dmsetup remove.
+	table := fmt.Sprintf("0 %d snapshot %s %s P %d", sectors, originDev, cowDev, ChunkSize)
+	cmd := exec.Command("dmsetup", "create", name, "--table", table)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err)
+	}
+	return nil
+}
+
+// dmsetupRemove removes a device-mapper device.
+func dmsetupRemove(name string) error {
+	cmd := exec.Command("dmsetup", "remove", name)
+	if out, err := cmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err)
+	}
+	return nil
+}
+
+// createSparseFile creates a sparse file of the given size.
+func createSparseFile(path string, sizeBytes int64) error {
+	f, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	if err := f.Truncate(sizeBytes); err != nil {
+		f.Close()
+		os.Remove(path)
+		return err
+	}
+	return f.Close()
+}
--- a/internal/filesystem/clone.go
+++ b/internal/filesystem/clone.go
@ -1,16 +0,0 @@
-package filesystem
-
-import (
-	"fmt"
-	"os/exec"
-)
-
-// CloneRootfs creates a copy-on-write clone of the base rootfs image.
-// Uses reflink if supported by the filesystem, falls back to regular copy.
-func CloneRootfs(src, dst string) error {
-	cmd := exec.Command("cp", "--reflink=auto", src, dst)
-	if out, err := cmd.CombinedOutput(); err != nil {
-		return fmt.Errorf("cp --reflink=auto: %s: %w", string(out), err)
-	}
-	return nil
-}
--- a/internal/filesystem/images.go
+++ b/internal/filesystem/images.go
@ -1 +0,0 @@
-package filesystem
--- a/internal/network/setup.go
+++ b/internal/network/setup.go
@ -104,7 +104,7 @@ func CreateNetwork(slot *Slot) error {
 		return fmt.Errorf("get host namespace: %w", err)
 	}
 	defer hostNS.Close()
-	defer netns.Set(hostNS)
+	defer func() { _ = netns.Set(hostNS) }()

 	// Create named network namespace.
 	ns, err := netns.NewNamed(slot.NamespaceID)
@ -304,17 +304,17 @@ func RemoveNetwork(slot *Slot) error {

 	// Remove host-side iptables rules (best effort).
 	if defaultIface != "" {
-		iptablesHost(
+		_ = iptablesHost(
 			"-D", "FORWARD",
 			"-i", slot.VethName, "-o", defaultIface,
 			"-j", "ACCEPT",
 		)
-		iptablesHost(
+		_ = iptablesHost(
 			"-D", "FORWARD",
 			"-i", defaultIface, "-o", slot.VethName,
 			"-j", "ACCEPT",
 		)
-		iptablesHost(
+		_ = iptablesHost(
 			"-t", "nat", "-D", "POSTROUTING",
 			"-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()),
 			"-o", defaultIface,
@ -324,18 +324,18 @@ func RemoveNetwork(slot *Slot) error {

 	// Remove host route.
 	_, hostNet, _ := net.ParseCIDR(fmt.Sprintf("%s/32", slot.HostIP.String()))
-	netlink.RouteDel(&netlink.Route{
+	_ = netlink.RouteDel(&netlink.Route{
 		Dst: hostNet,
 		Gw:  slot.VpeerIP,
 	})

 	// Delete veth (also destroys the peer in the namespace).
 	if veth, err := netlink.LinkByName(slot.VethName); err == nil {
-		netlink.LinkDel(veth)
+		_ = netlink.LinkDel(veth)
 	}

 	// Delete the named namespace.
-	netns.DeleteNamed(slot.NamespaceID)
+	_ = netns.DeleteNamed(slot.NamespaceID)

 	slog.Info("network removed", "ns", slot.NamespaceID)

--- a/internal/sandbox/manager.go
+++ b/internal/sandbox/manager.go
@ -11,8 +11,8 @@ import (

 	"github.com/google/uuid"

+	"git.omukk.dev/wrenn/sandbox/internal/devicemapper"
 	"git.omukk.dev/wrenn/sandbox/internal/envdclient"
-	"git.omukk.dev/wrenn/sandbox/internal/filesystem"
 	"git.omukk.dev/wrenn/sandbox/internal/id"
 	"git.omukk.dev/wrenn/sandbox/internal/models"
 	"git.omukk.dev/wrenn/sandbox/internal/network"
@ -35,6 +35,7 @@ type Manager struct {
 	cfg    Config
 	vm     *vm.Manager
 	slots  *network.SlotAllocator
+	loops  *devicemapper.LoopRegistry
 	mu     sync.RWMutex
 	boxes  map[string]*sandboxState
 	stopCh chan struct{}
@ -46,6 +47,8 @@ type sandboxState struct {
 	slot           *network.Slot
 	client         *envdclient.Client
 	uffdSocketPath string // non-empty for sandboxes restored from snapshot
+	dmDevice       *devicemapper.SnapshotDevice
+	baseImagePath  string // path to the base template rootfs (for loop registry release)
 }

 // New creates a new sandbox manager.
@ -57,6 +60,7 @@ func New(cfg Config) *Manager {
 		cfg:    cfg,
 		vm:     vm.NewManager(),
 		slots:  network.NewSlotAllocator(),
+		loops:  devicemapper.NewLoopRegistry(),
 		boxes:  make(map[string]*sandboxState),
 		stopCh: make(chan struct{}),
 	}
@ -91,16 +95,33 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 		return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
 	}

-	// Clone rootfs.
-	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, template))
-	if err := filesystem.CloneRootfs(baseRootfs, rootfsPath); err != nil {
-		return nil, fmt.Errorf("clone rootfs: %w", err)
+	// Acquire shared read-only loop device for the base image.
+	originLoop, err := m.loops.Acquire(baseRootfs)
+	if err != nil {
+		return nil, fmt.Errorf("acquire loop device: %w", err)
+	}
+
+	originSize, err := devicemapper.OriginSizeBytes(originLoop)
+	if err != nil {
+		m.loops.Release(baseRootfs)
+		return nil, fmt.Errorf("get origin size: %w", err)
+	}
+
+	// Create dm-snapshot with per-sandbox CoW file.
+	dmName := "wrenn-" + sandboxID
+	cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
+	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize)
+	if err != nil {
+		m.loops.Release(baseRootfs)
+		return nil, fmt.Errorf("create dm-snapshot: %w", err)
 	}

 	// Allocate network slot.
 	slotIdx, err := m.slots.Allocate()
 	if err != nil {
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("allocate network slot: %w", err)
 	}
 	slot := network.NewSlot(slotIdx)
@ -108,15 +129,17 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 	// Set up network.
 	if err := network.CreateNetwork(slot); err != nil {
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("create network: %w", err)
 	}

-	// Boot VM.
+	// Boot VM — Firecracker gets the dm device path.
 	vmCfg := vm.VMConfig{
 		SandboxID:        sandboxID,
 		KernelPath:       m.cfg.KernelPath,
-		RootfsPath:       rootfsPath,
+		RootfsPath:       dmDev.DevicePath,
 		VCPUs:            vcpus,
 		MemoryMB:         memoryMB,
 		NetworkNamespace: slot.NamespaceID,
@ -128,9 +151,11 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 	}

 	if _, err := m.vm.Create(ctx, vmCfg); err != nil {
-		network.RemoveNetwork(slot)
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("create VM: %w", err)
 	}

@ -140,10 +165,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 	defer waitCancel()

 	if err := client.WaitUntilReady(waitCtx); err != nil {
-		m.vm.Destroy(context.Background(), sandboxID)
-		network.RemoveNetwork(slot)
+		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("wait for envd: %w", err)
 	}

@ -158,12 +185,14 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 			TimeoutSec:   timeoutSec,
 			SlotIndex:    slotIdx,
 			HostIP:       slot.HostIP,
-			RootfsPath:   rootfsPath,
+			RootfsPath:   dmDev.DevicePath,
 			CreatedAt:    now,
 			LastActiveAt: now,
 		},
-		slot:   slot,
-		client: client,
+		slot:          slot,
+		client:        client,
+		dmDevice:      dmDev,
+		baseImagePath: baseRootfs,
 	}

 	m.mu.Lock()
@ -174,6 +203,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
 		"id", sandboxID,
 		"template", template,
 		"host_ip", slot.HostIP.String(),
+		"dm_device", dmDev.DevicePath,
 	)

 	return &sb.Sandbox, nil
@ -194,7 +224,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
 	}

 	// Always clean up pause snapshot files (may exist if sandbox was paused).
-	snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
+	warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))

 	slog.Info("sandbox destroyed", "id", sandboxID)
 	return nil
@ -209,7 +239,18 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
 		slog.Warn("network cleanup error", "id", sb.ID, "error", err)
 	}
 	m.slots.Release(sb.SlotIndex)
-	os.Remove(sb.RootfsPath)
+
+	// Tear down dm-snapshot and release the base image loop device.
+	if sb.dmDevice != nil {
+		if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil {
+			slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
+		}
+		os.Remove(sb.dmDevice.CowPath)
+	}
+	if sb.baseImagePath != "" {
+		m.loops.Release(sb.baseImagePath)
+	}
+
 	if sb.uffdSocketPath != "" {
 		os.Remove(sb.uffdSocketPath)
 	}
@ -228,12 +269,15 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
 	}

+	pauseStart := time.Now()
+
 	// Step 1: Pause the VM (freeze vCPUs).
 	if err := m.vm.Pause(ctx, sandboxID); err != nil {
 		return fmt.Errorf("pause VM: %w", err)
 	}
+	slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))

-	// Step 2: Take a full snapshot (snapfile + memfile).
+	// Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately.
 	if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
 		return fmt.Errorf("create snapshot dir: %w", err)
 	}
@ -242,39 +286,74 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 	rawMemPath := filepath.Join(snapDir, "memfile.raw")
 	snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID)

+	// For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to
+	// serialize memory — this is the main bottleneck on re-pause.
+	snapshotStart := time.Now()
 	if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil {
-		snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
+		warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
 		return fmt.Errorf("create VM snapshot: %w", err)
 	}
+	slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart))

 	// Step 3: Process the raw memfile into a compact diff + header.
 	buildID := uuid.New()
 	diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
 	headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID)

+	processStart := time.Now()
 	if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
-		snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
+		warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
 		return fmt.Errorf("process memfile: %w", err)
 	}
+	slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart))

 	// Remove the raw memfile — we only keep the compact diff.
 	os.Remove(rawMemPath)

-	// Step 4: Copy rootfs into snapshot dir.
-	snapshotRootfs := snapshot.RootfsPath(m.cfg.SnapshotsDir, sandboxID)
-	if err := filesystem.CloneRootfs(sb.RootfsPath, snapshotRootfs); err != nil {
-		snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
-		return fmt.Errorf("copy rootfs: %w", err)
+	// Step 4: Destroy the VM first so Firecracker releases the dm device.
+	if err := m.vm.Destroy(ctx, sb.ID); err != nil {
+		slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err)
+	}
+
+	// Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW.
+	if sb.dmDevice != nil {
+		if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil {
+			warnErr("dm-snapshot remove error during pause", sandboxID, err)
+		}
+
+		// Move (not copy) the CoW file into the snapshot directory.
+		snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
+		if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
+			warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
+			return fmt.Errorf("move cow file: %w", err)
+		}
+
+		// Record which base template this CoW was built against.
+		if err := snapshot.WriteMeta(m.cfg.SnapshotsDir, sandboxID, &snapshot.RootfsMeta{
+			BaseTemplate: sb.baseImagePath,
+		}); err != nil {
+			warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
+			return fmt.Errorf("write rootfs meta: %w", err)
+		}
+	}
+
+	// Step 6: Clean up remaining resources (network, loop device, uffd socket).
+	if err := network.RemoveNetwork(sb.slot); err != nil {
+		slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err)
+	}
+	m.slots.Release(sb.SlotIndex)
+	if sb.baseImagePath != "" {
+		m.loops.Release(sb.baseImagePath)
+	}
+	if sb.uffdSocketPath != "" {
+		os.Remove(sb.uffdSocketPath)
 	}

-	// Step 5: Destroy the sandbox (free VM, network, rootfs clone).
 	m.mu.Lock()
 	delete(m.boxes, sandboxID)
 	m.mu.Unlock()

-	m.cleanup(ctx, sb)
-
-	slog.Info("sandbox paused (snapshot + destroy)", "id", sandboxID)
+	slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart))
 	return nil
 }

@ -307,19 +386,62 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 		return nil, fmt.Errorf("create memory source: %w", err)
 	}

-	// Clone snapshot rootfs for this sandbox.
-	snapshotRootfs := snapshot.RootfsPath(snapDir, sandboxID)
-	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-resume.ext4", sandboxID))
-	if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
+	// Read rootfs metadata to find the base template image.
+	meta, err := snapshot.ReadMeta(snapDir, sandboxID)
+	if err != nil {
 		source.Close()
-		return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
+		return nil, fmt.Errorf("read rootfs meta: %w", err)
+	}
+
+	// Acquire the base image loop device and restore dm-snapshot from saved CoW.
+	baseImagePath := meta.BaseTemplate
+	originLoop, err := m.loops.Acquire(baseImagePath)
+	if err != nil {
+		source.Close()
+		return nil, fmt.Errorf("acquire loop device: %w", err)
+	}
+
+	originSize, err := devicemapper.OriginSizeBytes(originLoop)
+	if err != nil {
+		source.Close()
+		m.loops.Release(baseImagePath)
+		return nil, fmt.Errorf("get origin size: %w", err)
+	}
+
+	// Move CoW file from snapshot dir to sandboxes dir for the running sandbox.
+	savedCow := snapshot.CowPath(snapDir, sandboxID)
+	cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
+	if err := os.Rename(savedCow, cowPath); err != nil {
+		source.Close()
+		m.loops.Release(baseImagePath)
+		return nil, fmt.Errorf("move cow file: %w", err)
+	}
+
+	// rollbackCow attempts to move the CoW file back to the snapshot dir.
+	// Best-effort — logs a warning if it fails.
+	rollbackCow := func() {
+		if err := os.Rename(cowPath, savedCow); err != nil {
+			slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
+		}
+	}
+
+	// Restore dm-snapshot from existing persistent CoW file.
+	dmName := "wrenn-" + sandboxID
+	dmDev, err := devicemapper.RestoreSnapshot(dmName, originLoop, cowPath, originSize)
+	if err != nil {
+		source.Close()
+		m.loops.Release(baseImagePath)
+		rollbackCow()
+		return nil, fmt.Errorf("restore dm-snapshot: %w", err)
 	}

 	// Allocate network slot.
 	slotIdx, err := m.slots.Allocate()
 	if err != nil {
 		source.Close()
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		rollbackCow()
+		m.loops.Release(baseImagePath)
 		return nil, fmt.Errorf("allocate network slot: %w", err)
 	}
 	slot := network.NewSlot(slotIdx)
@ -327,7 +449,9 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 	if err := network.CreateNetwork(slot); err != nil {
 		source.Close()
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		rollbackCow()
+		m.loops.Release(baseImagePath)
 		return nil, fmt.Errorf("create network: %w", err)
 	}

@ -337,9 +461,11 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 	uffdServer := uffd.NewServer(uffdSocketPath, source)
 	if err := uffdServer.Start(ctx); err != nil {
 		source.Close()
-		network.RemoveNetwork(slot)
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		rollbackCow()
+		m.loops.Release(baseImagePath)
 		return nil, fmt.Errorf("start uffd server: %w", err)
 	}

@ -347,7 +473,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 	vmCfg := vm.VMConfig{
 		SandboxID:        sandboxID,
 		KernelPath:       m.cfg.KernelPath,
-		RootfsPath:       rootfsPath,
+		RootfsPath:       dmDev.DevicePath,
 		VCPUs:            int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot.
 		MemoryMB:         int(header.Metadata.Size / (1024 * 1024)),
 		NetworkNamespace: slot.NamespaceID,
@ -360,11 +486,13 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox

 	snapPath := snapshot.SnapPath(snapDir, sandboxID)
 	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
-		uffdServer.Stop()
+		warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
 		source.Close()
-		network.RemoveNetwork(slot)
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		rollbackCow()
+		m.loops.Release(baseImagePath)
 		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
 	}

@ -374,12 +502,14 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 	defer waitCancel()

 	if err := client.WaitUntilReady(waitCtx); err != nil {
-		uffdServer.Stop()
+		warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
 		source.Close()
-		m.vm.Destroy(context.Background(), sandboxID)
-		network.RemoveNetwork(slot)
+		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseImagePath)
 		return nil, fmt.Errorf("wait for envd: %w", err)
 	}

@ -394,25 +524,29 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 			TimeoutSec:   0,
 			SlotIndex:    slotIdx,
 			HostIP:       slot.HostIP,
-			RootfsPath:   rootfsPath,
+			RootfsPath:   dmDev.DevicePath,
 			CreatedAt:    now,
 			LastActiveAt: now,
 		},
 		slot:           slot,
 		client:         client,
 		uffdSocketPath: uffdSocketPath,
+		dmDevice:       dmDev,
+		baseImagePath:  baseImagePath,
 	}

 	m.mu.Lock()
 	m.boxes[sandboxID] = sb
 	m.mu.Unlock()

-	// Clean up the snapshot files now that the sandbox is running.
-	snapshot.Remove(snapDir, sandboxID)
+	// Clean up remaining snapshot files (snapfile, memfile, header, meta).
+	// The CoW file was already moved out.
+	warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID))

 	slog.Info("sandbox resumed from snapshot",
 		"id", sandboxID,
 		"host_ip", slot.HostIP.String(),
+		"dm_device", dmDev.DevicePath,
 	)

 	return &sb.Sandbox, nil
@ -421,7 +555,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 // CreateSnapshot creates a reusable template from a sandbox. Works on both
 // running and paused sandboxes. If the sandbox is running, it is paused first.
 // The sandbox remains paused after this call (it can still be resumed).
-// The template files are copied to ImagesDir/{name}/.
+//
+// The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4
+// so the template has no dependency on the original base image. Memory state
+// and VM snapshot files are copied as-is.
 func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (int64, error) {
 	// If the sandbox is running, pause it first.
 	if _, err := m.get(sandboxID); err == nil {
@ -435,29 +572,73 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
 		return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
 	}

-	// Copy snapshot files to ImagesDir/{name}/ as a reusable template.
+	// Create template directory.
 	if err := snapshot.EnsureDir(m.cfg.ImagesDir, name); err != nil {
 		return 0, fmt.Errorf("create template dir: %w", err)
 	}

+	// Copy VM snapshot and memory files.
 	srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
 	dstDir := snapshot.DirPath(m.cfg.ImagesDir, name)

-	for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName, snapshot.RootfsFileName} {
+	for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} {
 		src := filepath.Join(srcDir, fname)
 		dst := filepath.Join(dstDir, fname)
-		if err := filesystem.CloneRootfs(src, dst); err != nil {
-			snapshot.Remove(m.cfg.ImagesDir, name)
+		if err := copyFile(src, dst); err != nil {
+			warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
 			return 0, fmt.Errorf("copy %s: %w", fname, err)
 		}
 	}

+	// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
+	meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID)
+	if err != nil {
+		warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
+		return 0, fmt.Errorf("read rootfs meta: %w", err)
+	}
+
+	originLoop, err := m.loops.Acquire(meta.BaseTemplate)
+	if err != nil {
+		warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
+		return 0, fmt.Errorf("acquire loop device for flatten: %w", err)
+	}
+
+	originSize, err := devicemapper.OriginSizeBytes(originLoop)
+	if err != nil {
+		m.loops.Release(meta.BaseTemplate)
+		warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
+		return 0, fmt.Errorf("get origin size: %w", err)
+	}
+
+	// Temporarily restore the dm-snapshot to read the merged view.
+	cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
+	tmpDmName := "wrenn-flatten-" + sandboxID
+	tmpDev, err := devicemapper.RestoreSnapshot(tmpDmName, originLoop, cowPath, originSize)
+	if err != nil {
+		m.loops.Release(meta.BaseTemplate)
+		warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
+		return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err)
+	}
+
+	// Flatten to new standalone rootfs.
+	flattenedPath := snapshot.RootfsPath(m.cfg.ImagesDir, name)
+	flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath)
+
+	// Always clean up the temporary dm device.
+	warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(tmpDev))
+	m.loops.Release(meta.BaseTemplate)
+
+	if flattenErr != nil {
+		warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
+		return 0, fmt.Errorf("flatten rootfs: %w", flattenErr)
+	}
+
 	sizeBytes, err := snapshot.DirSize(m.cfg.ImagesDir, name)
 	if err != nil {
 		slog.Warn("failed to calculate snapshot size", "error", err)
 	}

-	slog.Info("snapshot created",
+	slog.Info("template snapshot created (rootfs flattened)",
 		"sandbox", sandboxID,
 		"name", name,
 		"size_bytes", sizeBytes,
@ -472,7 +653,9 @@ func (m *Manager) DeleteSnapshot(name string) error {

 // createFromSnapshot creates a new sandbox by restoring from a snapshot template
 // in ImagesDir/{snapshotName}/. Uses UFFD for lazy memory loading.
-func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, memoryMB, timeoutSec int) (*models.Sandbox, error) {
+// The template's rootfs.ext4 is a flattened standalone image — we create a
+// dm-snapshot on top of it just like a normal Create.
+func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, _, timeoutSec int) (*models.Sandbox, error) {
 	imagesDir := m.cfg.ImagesDir

 	// Read the header.
@ -486,10 +669,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 		return nil, fmt.Errorf("deserialize header: %w", err)
 	}

-	// Snapshot determines memory size. VCPUs are also baked into the
-	// snapshot state — the caller should pass the correct value from
-	// the template DB record.
-	memoryMB = int(header.Metadata.Size / (1024 * 1024))
+	// Snapshot determines memory size.
+	memoryMB := int(header.Metadata.Size / (1024 * 1024))

 	// Build diff file map.
 	diffPaths := map[string]string{
@ -501,19 +682,37 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 		return nil, fmt.Errorf("create memory source: %w", err)
 	}

-	// Clone snapshot rootfs.
-	snapshotRootfs := snapshot.RootfsPath(imagesDir, snapshotName)
-	rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, snapshotName))
-	if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
+	// Set up dm-snapshot on the template's flattened rootfs.
+	baseRootfs := snapshot.RootfsPath(imagesDir, snapshotName)
+	originLoop, err := m.loops.Acquire(baseRootfs)
+	if err != nil {
 		source.Close()
-		return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
+		return nil, fmt.Errorf("acquire loop device: %w", err)
+	}
+
+	originSize, err := devicemapper.OriginSizeBytes(originLoop)
+	if err != nil {
+		source.Close()
+		m.loops.Release(baseRootfs)
+		return nil, fmt.Errorf("get origin size: %w", err)
+	}
+
+	dmName := "wrenn-" + sandboxID
+	cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
+	dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize)
+	if err != nil {
+		source.Close()
+		m.loops.Release(baseRootfs)
+		return nil, fmt.Errorf("create dm-snapshot: %w", err)
 	}

 	// Allocate network.
 	slotIdx, err := m.slots.Allocate()
 	if err != nil {
 		source.Close()
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("allocate network slot: %w", err)
 	}
 	slot := network.NewSlot(slotIdx)
@ -521,7 +720,9 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 	if err := network.CreateNetwork(slot); err != nil {
 		source.Close()
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("create network: %w", err)
 	}

@ -531,9 +732,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 	uffdServer := uffd.NewServer(uffdSocketPath, source)
 	if err := uffdServer.Start(ctx); err != nil {
 		source.Close()
-		network.RemoveNetwork(slot)
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("start uffd server: %w", err)
 	}

@ -541,7 +744,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 	vmCfg := vm.VMConfig{
 		SandboxID:        sandboxID,
 		KernelPath:       m.cfg.KernelPath,
-		RootfsPath:       rootfsPath,
+		RootfsPath:       dmDev.DevicePath,
 		VCPUs:            vcpus,
 		MemoryMB:         memoryMB,
 		NetworkNamespace: slot.NamespaceID,
@ -554,11 +757,13 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam

 	snapPath := snapshot.SnapPath(imagesDir, snapshotName)
 	if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
-		uffdServer.Stop()
+		warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
 		source.Close()
-		network.RemoveNetwork(slot)
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("restore VM from snapshot: %w", err)
 	}

@ -568,12 +773,14 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 	defer waitCancel()

 	if err := client.WaitUntilReady(waitCtx); err != nil {
-		uffdServer.Stop()
+		warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
 		source.Close()
-		m.vm.Destroy(context.Background(), sandboxID)
-		network.RemoveNetwork(slot)
+		warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
+		warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
 		m.slots.Release(slotIdx)
-		os.Remove(rootfsPath)
+		warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
+		os.Remove(cowPath)
+		m.loops.Release(baseRootfs)
 		return nil, fmt.Errorf("wait for envd: %w", err)
 	}

@ -588,13 +795,15 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 			TimeoutSec:   timeoutSec,
 			SlotIndex:    slotIdx,
 			HostIP:       slot.HostIP,
-			RootfsPath:   rootfsPath,
+			RootfsPath:   dmDev.DevicePath,
 			CreatedAt:    now,
 			LastActiveAt: now,
 		},
 		slot:           slot,
 		client:         client,
 		uffdSocketPath: uffdSocketPath,
+		dmDevice:       dmDev,
+		baseImagePath:  baseRootfs,
 	}

 	m.mu.Lock()
@ -605,6 +814,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
 		"id", sandboxID,
 		"snapshot", snapshotName,
 		"host_ip", slot.HostIP.String(),
+		"dm_device", dmDev.DevicePath,
 	)

 	return &sb.Sandbox, nil
@ -735,7 +945,7 @@ func (m *Manager) reapExpired(ctx context.Context) {
 	}
 }

-// Shutdown destroys all sandboxes and stops the TTL reaper.
+// Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper.
 func (m *Manager) Shutdown(ctx context.Context) {
 	close(m.stopCh)

@ -752,4 +962,35 @@ func (m *Manager) Shutdown(ctx context.Context) {
 			slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
 		}
 	}
+
+	m.loops.ReleaseAll()
+}
+
+// warnErr logs a warning if err is non-nil. Used for best-effort cleanup
+// in error paths where the primary error has already been captured.
+func warnErr(msg string, id string, err error) {
+	if err != nil {
+		slog.Warn(msg, "id", id, "error", err)
+	}
+}
+
+// copyFile copies a regular file from src to dst using streaming I/O.
+func copyFile(src, dst string) error {
+	sf, err := os.Open(src)
+	if err != nil {
+		return fmt.Errorf("open %s: %w", src, err)
+	}
+	defer sf.Close()
+
+	df, err := os.Create(dst)
+	if err != nil {
+		return fmt.Errorf("create %s: %w", dst, err)
+	}
+	defer df.Close()
+
+	if _, err := df.ReadFrom(sf); err != nil {
+		os.Remove(dst)
+		return fmt.Errorf("copy %s → %s: %w", src, dst, err)
+	}
+	return nil
 }
--- a/internal/snapshot/local.go
+++ b/internal/snapshot/local.go
@ -1,10 +1,12 @@
 package snapshot

 import (
+	"encoding/json"
 	"fmt"
 	"io/fs"
 	"os"
 	"path/filepath"
+	"syscall"
 )

 const (
@ -12,6 +14,8 @@ const (
 	MemDiffName    = "memfile"
 	MemHeaderName  = "memfile.header"
 	RootfsFileName = "rootfs.ext4"
+	RootfsCowName  = "rootfs.cow"
+	RootfsMetaName = "rootfs.meta"
 )

 // DirPath returns the snapshot directory for a given name.
@ -39,15 +43,68 @@ func RootfsPath(baseDir, name string) string {
 	return filepath.Join(DirPath(baseDir, name), RootfsFileName)
 }

+// CowPath returns the path to the rootfs CoW diff file.
+func CowPath(baseDir, name string) string {
+	return filepath.Join(DirPath(baseDir, name), RootfsCowName)
+}
+
+// MetaPath returns the path to the rootfs metadata file.
+func MetaPath(baseDir, name string) string {
+	return filepath.Join(DirPath(baseDir, name), RootfsMetaName)
+}
+
+// RootfsMeta records which base template a CoW file was created against.
+type RootfsMeta struct {
+	BaseTemplate string `json:"base_template"`
+}
+
+// WriteMeta writes rootfs metadata to the snapshot directory.
+func WriteMeta(baseDir, name string, meta *RootfsMeta) error {
+	data, err := json.Marshal(meta)
+	if err != nil {
+		return fmt.Errorf("marshal rootfs meta: %w", err)
+	}
+	if err := os.WriteFile(MetaPath(baseDir, name), data, 0644); err != nil {
+		return fmt.Errorf("write rootfs meta: %w", err)
+	}
+	return nil
+}
+
+// ReadMeta reads rootfs metadata from the snapshot directory.
+func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
+	data, err := os.ReadFile(MetaPath(baseDir, name))
+	if err != nil {
+		return nil, fmt.Errorf("read rootfs meta: %w", err)
+	}
+	var meta RootfsMeta
+	if err := json.Unmarshal(data, &meta); err != nil {
+		return nil, fmt.Errorf("unmarshal rootfs meta: %w", err)
+	}
+	return &meta, nil
+}
+
 // Exists reports whether a complete snapshot exists (all required files present).
+// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
 func Exists(baseDir, name string) bool {
 	dir := DirPath(baseDir, name)
-	for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName, RootfsFileName} {
+
+	// Common files required by both formats.
+	for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} {
 		if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
 			return false
 		}
 	}
-	return true
+
+	// Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot).
+	if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil {
+		return true
+	}
+	if _, err := os.Stat(filepath.Join(dir, RootfsCowName)); err == nil {
+		if _, err := os.Stat(filepath.Join(dir, RootfsMetaName)); err == nil {
+			return true
+		}
+	}
+	return false
 }

 // IsTemplate reports whether a template image directory exists (has rootfs.ext4).
@ -61,6 +118,15 @@ func IsSnapshot(baseDir, name string) bool {
 	return Exists(baseDir, name)
 }

+// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta)
+// as opposed to legacy full rootfs (rootfs.ext4).
+func HasCow(baseDir, name string) bool {
+	dir := DirPath(baseDir, name)
+	_, cowErr := os.Stat(filepath.Join(dir, RootfsCowName))
+	_, metaErr := os.Stat(filepath.Join(dir, RootfsMetaName))
+	return cowErr == nil && metaErr == nil
+}
+
 // EnsureDir creates the snapshot directory if it doesn't exist.
 func EnsureDir(baseDir, name string) error {
 	dir := DirPath(baseDir, name)
@ -75,12 +141,14 @@ func Remove(baseDir, name string) error {
 	return os.RemoveAll(DirPath(baseDir, name))
 }

-// DirSize returns the total byte size of all files in the snapshot directory.
+// DirSize returns the actual disk usage of all files in the snapshot directory.
+// Uses block-based accounting (stat.Blocks * 512) so sparse files report only
+// the blocks that are actually allocated, not their apparent size.
 func DirSize(baseDir, name string) (int64, error) {
 	var total int64
 	dir := DirPath(baseDir, name)

-	err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, err error) error {
+	err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
 		if err != nil {
 			return err
 		}
@ -91,7 +159,13 @@ func DirSize(baseDir, name string) (int64, error) {
 		if err != nil {
 			return err
 		}
-		total += info.Size()
+		if sys, ok := info.Sys().(*syscall.Stat_t); ok {
+			// Blocks is in 512-byte units regardless of filesystem block size.
+			total += sys.Blocks * 512
+		} else {
+			// Fallback to apparent size if syscall stat is unavailable.
+			total += info.Size()
+		}
 		return nil
 	})
 	if err != nil {
--- a/internal/uffd/server.go
+++ b/internal/uffd/server.go
@ -115,7 +115,7 @@ func (s *Server) Ready() <-chan struct{} {
 // Stop signals the UFFD poll loop to exit and waits for it to finish.
 func (s *Server) Stop() error {
 	// Write a byte to the exit pipe to wake the poll loop.
-	s.exitW.Write([]byte{0})
+	_, _ = s.exitW.Write([]byte{0})
 	<-s.doneCh
 	return s.doneErr
 }
--- a/internal/vm/config.go
+++ b/internal/vm/config.go
@ -10,8 +10,8 @@ type VMConfig struct {
 	// KernelPath is the path to the uncompressed Linux kernel (vmlinux).
 	KernelPath string

-	// RootfsPath is the path to the ext4 rootfs image for this sandbox.
-	// This should be a per-sandbox copy (reflink clone of the base image).
+	// RootfsPath is the path to the rootfs block device for this sandbox.
+	// Typically a dm-snapshot device (e.g., /dev/mapper/wrenn-sb-a1b2c3d4).
 	RootfsPath string

 	// VCPUs is the number of virtual CPUs to allocate (default: 1).
--- a/internal/vm/fc.go
+++ b/internal/vm/fc.go
@ -131,15 +131,6 @@ func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string)
 	})
 }

-// loadSnapshot loads a VM snapshot from a file-backed memory image.
-func (c *fcClient) loadSnapshot(ctx context.Context, snapPath, memPath string) error {
-	return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
-		"snapshot_path": snapPath,
-		"mem_file_path": memPath,
-		"resume_vm":     false,
-	})
-}
-
 // loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
 // lazy memory loading. Firecracker will connect to the socket and
 // send the uffd fd + memory region mappings.
--- a/internal/vm/manager.go
+++ b/internal/vm/manager.go
@ -53,7 +53,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {

 	// Step 2: Wait for the API socket to appear.
 	if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("wait for socket: %w", err)
 	}

@ -61,13 +61,13 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
 	client := newFCClient(cfg.SocketPath)

 	if err := configureVM(ctx, client, &cfg); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("configure VM: %w", err)
 	}

 	// Step 4: Start the VM.
 	if err := client.startVM(ctx); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("start VM: %w", err)
 	}

@ -218,7 +218,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath

 	// Step 2: Wait for the API socket.
 	if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("wait for socket: %w", err)
 	}

@ -228,13 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
 	// No boot resources are configured — the snapshot carries kernel,
 	// drive, network, and machine config state.
 	if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("load snapshot: %w", err)
 	}

 	// Step 4: Resume the VM.
 	if err := client.resumeVM(ctx); err != nil {
-		proc.stop()
+		_ = proc.stop()
 		return nil, fmt.Errorf("resume VM: %w", err)
 	}