Add device-mapper snapshots, test UI, fix pause ordering and lint errors

- Replace reflink rootfs copy with device-mapper snapshots (shared
  read-only loop device per base template, per-sandbox sparse CoW file)
- Add devicemapper package with create/restore/remove/flatten operations
  and refcounted LoopRegistry for base image loop devices
- Fix pause ordering: destroy VM before removing dm-snapshot to avoid
  "device busy" error (FC must release the dm device first)
- Add test UI at GET /test for sandbox lifecycle management (create,
  pause, resume, destroy, exec, snapshot create/list/delete)
- Fix DirSize to report actual disk usage (stat.Blocks * 512) instead
  of apparent size, so sparse CoW files report correctly
- Add timing logs to pause flow for performance diagnostics
- Fix all lint errors across api, network, vm, uffd, and sandbox packages
- Remove obsolete internal/filesystem package (replaced by devicemapper)
- Update CLAUDE.md with device-mapper architecture documentation
This commit is contained in:
2026-03-13 08:25:40 +06:00
parent 778894b488
commit 63e9132d38
23 changed files with 1202 additions and 155 deletions

View File

@ -3,6 +3,7 @@ package api
import (
"encoding/base64"
"encoding/json"
"log/slog"
"net/http"
"time"
"unicode/utf8"
@ -85,13 +86,15 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
duration := time.Since(start)
// Update last active.
h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
ID: sandboxID,
LastActiveAt: pgtype.Timestamptz{
Time: time.Now(),
Valid: true,
},
})
}); err != nil {
slog.Warn("failed to update last_active_at", "id", sandboxID, "error", err)
}
// Use base64 encoding if output contains non-UTF-8 bytes.
stdout := resp.Msg.Stdout

View File

@ -37,11 +37,6 @@ type wsStartMsg struct {
Args []string `json:"args"`
}
// wsStopMsg is sent by the client to stop the process.
type wsStopMsg struct {
Type string `json:"type"` // "stop"
}
// wsOutMsg is sent by the server for process events.
type wsOutMsg struct {
Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error"

View File

@ -128,5 +128,5 @@ func (h *filesHandler) Download(w http.ResponseWriter, r *http.Request) {
}
w.Header().Set("Content-Type", "application/octet-stream")
w.Write(resp.Msg.Content)
_, _ = w.Write(resp.Msg.Content)
}

View File

@ -2,6 +2,7 @@ package api
import (
"io"
"log/slog"
"mime"
"mime/multipart"
"net/http"
@ -189,6 +190,6 @@ func (h *filesStreamHandler) StreamDownload(w http.ResponseWriter, r *http.Reque
if err := stream.Err(); err != nil {
// Headers already sent, nothing we can do but log.
// The client will see a truncated response.
slog.Warn("file stream error after headers sent", "error", err)
}
}

View File

@ -111,7 +111,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
sandboxID := id.NewSandboxID()
// Insert pending record.
sb, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{
_, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{
ID: sandboxID,
OwnerID: "",
HostID: "default",
@ -136,9 +136,11 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
TimeoutSec: req.TimeoutSec,
}))
if err != nil {
h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
if _, dbErr := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "error",
})
}); dbErr != nil {
slog.Warn("failed to update sandbox status to error", "id", sandboxID, "error", dbErr)
}
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
@ -146,7 +148,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
// Update to running.
now := time.Now()
sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
sb, err := h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",

View File

@ -84,7 +84,9 @@ func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
return
}
// Delete existing template record and files.
h.db.DeleteTemplate(ctx, req.Name)
if err := h.db.DeleteTemplate(ctx, req.Name); err != nil {
slog.Warn("failed to delete existing template", "name", req.Name, "error", err)
}
}
// Verify sandbox exists and is running or paused.

View File

@ -0,0 +1,435 @@
package api
import (
"fmt"
"net/http"
)
func serveTestUI(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
fmt.Fprint(w, testUIHTML)
}
const testUIHTML = `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Wrenn Sandbox — Test Console</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: 'Menlo', 'Consolas', 'JetBrains Mono', monospace;
font-size: 13px;
background: #0f1211;
color: #c8c4bc;
padding: 16px;
}
h1 { font-size: 18px; color: #e8e5df; margin-bottom: 12px; }
h2 { font-size: 14px; color: #89a785; margin: 16px 0 8px; border-bottom: 1px solid #262c2a; padding-bottom: 4px; }
.grid { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
.panel {
background: #151918;
border: 1px solid #262c2a;
border-radius: 8px;
padding: 12px;
}
.full { grid-column: 1 / -1; }
label { display: block; color: #8a867f; margin: 6px 0 2px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.05em; }
input, select {
width: 100%;
background: #1b201e;
border: 1px solid #262c2a;
color: #e8e5df;
padding: 6px 8px;
border-radius: 4px;
font-family: inherit;
font-size: 13px;
}
input:focus, select:focus { outline: none; border-color: #5e8c58; }
.btn-row { display: flex; gap: 6px; margin-top: 8px; flex-wrap: wrap; }
button {
padding: 6px 14px;
border: 1px solid #262c2a;
border-radius: 4px;
font-family: inherit;
font-size: 12px;
cursor: pointer;
background: #1b201e;
color: #c8c4bc;
transition: all 0.15s;
}
button:hover { border-color: #5e8c58; color: #e8e5df; }
.btn-green { background: #2a3d28; border-color: #5e8c58; color: #89a785; }
.btn-green:hover { background: #3a5035; }
.btn-red { background: #3d2828; border-color: #b35544; color: #c27b6d; }
.btn-red:hover { background: #4d3030; }
.btn-amber { background: #3d3428; border-color: #9e7c2e; color: #c8a84e; }
.btn-amber:hover { background: #4d4030; }
.btn-blue { background: #28343d; border-color: #3d7aac; color: #6da0cc; }
.btn-blue:hover { background: #304050; }
table { width: 100%; border-collapse: collapse; margin-top: 8px; }
th { text-align: left; font-size: 11px; color: #8a867f; text-transform: uppercase; letter-spacing: 0.05em; padding: 4px 8px; border-bottom: 1px solid #262c2a; }
td { padding: 6px 8px; border-bottom: 1px solid #1b201e; }
tr:hover td { background: #1b201e; }
.status { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 11px; font-weight: 600; }
.status-running { background: rgba(94,140,88,0.15); color: #89a785; }
.status-paused { background: rgba(158,124,46,0.15); color: #c8a84e; }
.status-pending { background: rgba(61,122,172,0.15); color: #6da0cc; }
.status-stopped { background: rgba(138,134,127,0.15); color: #8a867f; }
.status-error { background: rgba(179,85,68,0.15); color: #c27b6d; }
.status-hibernated { background: rgba(61,122,172,0.15); color: #6da0cc; }
.log {
background: #0f1211;
border: 1px solid #262c2a;
border-radius: 4px;
padding: 8px;
max-height: 300px;
overflow-y: auto;
margin-top: 8px;
font-size: 12px;
white-space: pre-wrap;
word-break: break-all;
}
.log-entry { margin-bottom: 4px; }
.log-time { color: #5f5c57; }
.log-ok { color: #89a785; }
.log-err { color: #c27b6d; }
.log-info { color: #6da0cc; }
.exec-output {
background: #0f1211;
border: 1px solid #262c2a;
border-radius: 4px;
padding: 8px;
max-height: 200px;
overflow-y: auto;
margin-top: 8px;
font-size: 12px;
white-space: pre-wrap;
}
.clickable { cursor: pointer; color: #89a785; text-decoration: underline; }
.clickable:hover { color: #aacdaa; }
</style>
</head>
<body>
<h1>Wrenn Sandbox Test Console</h1>
<div class="grid">
<!-- Create Sandbox -->
<div class="panel">
<h2>Create Sandbox</h2>
<label>Template</label>
<input type="text" id="create-template" value="minimal" placeholder="minimal or snapshot name">
<label>vCPUs</label>
<input type="number" id="create-vcpus" value="1" min="1" max="8">
<label>Memory (MB)</label>
<input type="number" id="create-memory" value="512" min="128" max="8192">
<label>Timeout (sec)</label>
<input type="number" id="create-timeout" value="300" min="30">
<div class="btn-row">
<button class="btn-green" onclick="createSandbox()">Create</button>
</div>
</div>
<!-- Snapshot Management -->
<div class="panel">
<h2>Create Snapshot</h2>
<label>Sandbox ID</label>
<input type="text" id="snap-sandbox-id" placeholder="sb-xxxxxxxx">
<label>Snapshot Name (optional)</label>
<input type="text" id="snap-name" placeholder="auto-generated if empty">
<div class="btn-row">
<button class="btn-amber" onclick="createSnapshot()">Create Snapshot</button>
<label style="display:inline-flex;align-items:center;margin:0;font-size:12px;text-transform:none;letter-spacing:0">
<input type="checkbox" id="snap-overwrite" style="width:auto;margin-right:4px"> Overwrite
</label>
</div>
<h2>Snapshots / Templates</h2>
<div class="btn-row">
<button onclick="listSnapshots()">Refresh</button>
</div>
<div id="snapshots-table"></div>
</div>
<!-- Execute Command -->
<div class="panel">
<h2>Execute Command</h2>
<label>Sandbox ID</label>
<input type="text" id="exec-sandbox-id" placeholder="sb-xxxxxxxx">
<label>Command</label>
<input type="text" id="exec-cmd" value="/bin/sh" placeholder="/bin/sh">
<label>Args (comma separated)</label>
<input type="text" id="exec-args" value="-c,uname -a" placeholder="-c,echo hello">
<div class="btn-row">
<button class="btn-green" onclick="execCmd()">Run</button>
</div>
<div id="exec-output" class="exec-output" style="display:none"></div>
</div>
<!-- Activity Log -->
<div class="panel">
<h2>Activity Log</h2>
<div id="log" class="log"></div>
</div>
<!-- Sandboxes List -->
<div class="panel full">
<h2>Sandboxes</h2>
<div class="btn-row">
<button onclick="listSandboxes()">Refresh</button>
<label style="display:inline-flex;align-items:center;margin:0;font-size:12px;text-transform:none;letter-spacing:0">
<input type="checkbox" id="auto-refresh" style="width:auto;margin-right:4px"> Auto-refresh (5s)
</label>
</div>
<div id="sandboxes-table"></div>
</div>
</div>
<script>
const API = '';
function log(msg, level) {
const el = document.getElementById('log');
const t = new Date().toLocaleTimeString();
const cls = level === 'ok' ? 'log-ok' : level === 'err' ? 'log-err' : 'log-info';
el.innerHTML = '<div class="log-entry"><span class="log-time">' + t + '</span> <span class="' + cls + '">' + esc(msg) + '</span></div>' + el.innerHTML;
}
function esc(s) {
const d = document.createElement('div');
d.textContent = s;
return d.innerHTML;
}
async function api(method, path, body) {
const opts = { method, headers: {} };
if (body) {
opts.headers['Content-Type'] = 'application/json';
opts.body = JSON.stringify(body);
}
const resp = await fetch(API + path, opts);
if (resp.status === 204) return null;
const data = await resp.json();
if (resp.status >= 300) {
throw new Error(data.error ? data.error.message : resp.statusText);
}
return data;
}
function statusBadge(s) {
return '<span class="status status-' + s + '">' + s + '</span>';
}
// --- Sandboxes ---
async function listSandboxes() {
try {
const data = await api('GET', '/v1/sandboxes');
renderSandboxes(data);
} catch (e) {
log('List sandboxes failed: ' + e.message, 'err');
}
}
function renderSandboxes(sandboxes) {
if (!sandboxes || sandboxes.length === 0) {
document.getElementById('sandboxes-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No sandboxes</p>';
return;
}
let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
for (const sb of sandboxes) {
html += '<tr>';
html += '<td class="clickable" onclick="useSandbox(\'' + sb.id + '\')">' + sb.id + '</td>';
html += '<td>' + statusBadge(sb.status) + '</td>';
html += '<td>' + esc(sb.template) + '</td>';
html += '<td>' + sb.vcpus + '</td>';
html += '<td>' + sb.memory_mb + 'MB</td>';
html += '<td>' + (sb.host_ip || '-') + '</td>';
html += '<td>' + new Date(sb.created_at).toLocaleTimeString() + '</td>';
html += '<td><div class="btn-row">';
if (sb.status === 'running') {
html += '<button class="btn-amber" onclick="pauseSandbox(\'' + sb.id + '\')">Pause</button>';
html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
} else if (sb.status === 'paused') {
html += '<button class="btn-green" onclick="resumeSandbox(\'' + sb.id + '\')">Resume</button>';
html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
} else {
html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
}
html += '</div></td>';
html += '</tr>';
}
html += '</tbody></table>';
document.getElementById('sandboxes-table').innerHTML = html;
}
function useSandbox(id) {
document.getElementById('exec-sandbox-id').value = id;
document.getElementById('snap-sandbox-id').value = id;
}
async function createSandbox() {
const template = document.getElementById('create-template').value;
const vcpus = parseInt(document.getElementById('create-vcpus').value);
const memory_mb = parseInt(document.getElementById('create-memory').value);
const timeout_sec = parseInt(document.getElementById('create-timeout').value);
log('Creating sandbox (template=' + template + ', vcpus=' + vcpus + ', mem=' + memory_mb + 'MB)...', 'info');
try {
const data = await api('POST', '/v1/sandboxes', { template, vcpus, memory_mb, timeout_sec });
log('Created sandbox ' + data.id + ' [' + data.status + ']', 'ok');
listSandboxes();
} catch (e) {
log('Create failed: ' + e.message, 'err');
}
}
async function pauseSandbox(id) {
log('Pausing ' + id + '...', 'info');
try {
await api('POST', '/v1/sandboxes/' + id + '/pause');
log('Paused ' + id, 'ok');
listSandboxes();
} catch (e) {
log('Pause failed: ' + e.message, 'err');
}
}
async function resumeSandbox(id) {
log('Resuming ' + id + '...', 'info');
try {
await api('POST', '/v1/sandboxes/' + id + '/resume');
log('Resumed ' + id, 'ok');
listSandboxes();
} catch (e) {
log('Resume failed: ' + e.message, 'err');
}
}
async function destroySandbox(id) {
log('Destroying ' + id + '...', 'info');
try {
await api('DELETE', '/v1/sandboxes/' + id);
log('Destroyed ' + id, 'ok');
listSandboxes();
} catch (e) {
log('Destroy failed: ' + e.message, 'err');
}
}
// --- Exec ---
async function execCmd() {
const sandboxId = document.getElementById('exec-sandbox-id').value;
const cmd = document.getElementById('exec-cmd').value;
const argsStr = document.getElementById('exec-args').value;
const args = argsStr ? argsStr.split(',').map(s => s.trim()) : [];
if (!sandboxId) { log('No sandbox ID for exec', 'err'); return; }
const out = document.getElementById('exec-output');
out.style.display = 'block';
out.textContent = 'Running...';
log('Exec on ' + sandboxId + ': ' + cmd + ' ' + args.join(' '), 'info');
try {
const data = await api('POST', '/v1/sandboxes/' + sandboxId + '/exec', { cmd, args });
let text = '';
if (data.stdout) text += data.stdout;
if (data.stderr) text += '\n[stderr]\n' + data.stderr;
text += '\n[exit_code=' + data.exit_code + ', duration=' + data.duration_ms + 'ms]';
out.textContent = text;
log('Exec completed (exit=' + data.exit_code + ')', data.exit_code === 0 ? 'ok' : 'err');
} catch (e) {
out.textContent = 'Error: ' + e.message;
log('Exec failed: ' + e.message, 'err');
}
}
// --- Snapshots ---
async function createSnapshot() {
const sandbox_id = document.getElementById('snap-sandbox-id').value;
const name = document.getElementById('snap-name').value;
const overwrite = document.getElementById('snap-overwrite').checked;
if (!sandbox_id) { log('No sandbox ID for snapshot', 'err'); return; }
const body = { sandbox_id };
if (name) body.name = name;
const qs = overwrite ? '?overwrite=true' : '';
log('Creating snapshot from ' + sandbox_id + (name ? ' as "' + name + '"' : '') + '...', 'info');
try {
const data = await api('POST', '/v1/snapshots' + qs, body);
log('Snapshot created: ' + data.name + ' (' + (data.size_bytes / 1024 / 1024).toFixed(1) + 'MB)', 'ok');
listSnapshots();
listSandboxes();
} catch (e) {
log('Snapshot failed: ' + e.message, 'err');
}
}
async function listSnapshots() {
try {
const data = await api('GET', '/v1/snapshots');
renderSnapshots(data);
} catch (e) {
log('List snapshots failed: ' + e.message, 'err');
}
}
function renderSnapshots(snapshots) {
if (!snapshots || snapshots.length === 0) {
document.getElementById('snapshots-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No snapshots</p>';
return;
}
let html = '<table><thead><tr><th>Name</th><th>Type</th><th>vCPUs</th><th>Mem</th><th>Size</th><th>Actions</th></tr></thead><tbody>';
for (const s of snapshots) {
html += '<tr>';
html += '<td class="clickable" onclick="useTemplate(\'' + esc(s.name) + '\')">' + esc(s.name) + '</td>';
html += '<td>' + s.type + '</td>';
html += '<td>' + (s.vcpus || '-') + '</td>';
html += '<td>' + (s.memory_mb ? s.memory_mb + 'MB' : '-') + '</td>';
html += '<td>' + (s.size_bytes / 1024 / 1024).toFixed(1) + 'MB</td>';
html += '<td><button class="btn-red" onclick="deleteSnapshot(\'' + esc(s.name) + '\')">Delete</button></td>';
html += '</tr>';
}
html += '</tbody></table>';
document.getElementById('snapshots-table').innerHTML = html;
}
function useTemplate(name) {
document.getElementById('create-template').value = name;
log('Template set to "' + name + '" — click Create to launch from this snapshot', 'info');
}
async function deleteSnapshot(name) {
log('Deleting snapshot "' + name + '"...', 'info');
try {
await api('DELETE', '/v1/snapshots/' + encodeURIComponent(name));
log('Deleted snapshot "' + name + '"', 'ok');
listSnapshots();
} catch (e) {
log('Delete snapshot failed: ' + e.message, 'err');
}
}
// --- Auto-refresh ---
let refreshInterval = null;
document.getElementById('auto-refresh').addEventListener('change', function() {
if (this.checked) {
refreshInterval = setInterval(listSandboxes, 5000);
} else {
clearInterval(refreshInterval);
refreshInterval = null;
}
});
// --- Init ---
listSandboxes();
listSnapshots();
</script>
</body>
</html>`

View File

@ -24,7 +24,7 @@ type errorDetail struct {
func writeJSON(w http.ResponseWriter, status int, v any) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(status)
json.NewEncoder(w).Encode(v)
_ = json.NewEncoder(w).Encode(v)
}
func writeError(w http.ResponseWriter, status int, code, message string) {

View File

@ -35,6 +35,9 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
r.Get("/openapi.yaml", serveOpenAPI)
r.Get("/docs", serveDocs)
// Test UI for sandbox lifecycle management.
r.Get("/test", serveTestUI)
// Sandbox CRUD.
r.Route("/v1/sandboxes", func(r chi.Router) {
r.Post("/", sandbox.Create)
@ -71,7 +74,7 @@ func (s *Server) Handler() http.Handler {
func serveOpenAPI(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/yaml")
w.Write(openapiYAML)
_, _ = w.Write(openapiYAML)
}
func serveDocs(w http.ResponseWriter, r *http.Request) {

View File

@ -0,0 +1,317 @@
// Package devicemapper provides device-mapper snapshot operations for
// copy-on-write rootfs management. Each sandbox gets a dm-snapshot backed
// by a shared read-only loop device (the base template image) and a
// per-sandbox sparse CoW file that stores only modified blocks.
package devicemapper
import (
"fmt"
"log/slog"
"os"
"os/exec"
"strconv"
"strings"
"sync"
)
const (
// ChunkSize is the dm-snapshot chunk size in 512-byte sectors.
// 8 sectors = 4KB, matching the standard page/block size.
ChunkSize = 8
)
// loopEntry tracks a loop device and its reference count.
type loopEntry struct {
device string // e.g., /dev/loop0
refcount int
}
// LoopRegistry manages loop devices for base template images.
// Each unique image path gets one read-only loop device, shared
// across all sandboxes using that template. Reference counting
// ensures the loop device is released when no sandboxes use it.
type LoopRegistry struct {
mu sync.Mutex
entries map[string]*loopEntry // imagePath → loopEntry
}
// NewLoopRegistry creates a new loop device registry.
func NewLoopRegistry() *LoopRegistry {
return &LoopRegistry{
entries: make(map[string]*loopEntry),
}
}
// Acquire returns a read-only loop device for the given image path.
// If one already exists, its refcount is incremented. Otherwise a new
// loop device is created via losetup.
func (r *LoopRegistry) Acquire(imagePath string) (string, error) {
r.mu.Lock()
defer r.mu.Unlock()
if e, ok := r.entries[imagePath]; ok {
e.refcount++
slog.Debug("loop device reused", "image", imagePath, "device", e.device, "refcount", e.refcount)
return e.device, nil
}
dev, err := losetupCreate(imagePath)
if err != nil {
return "", fmt.Errorf("losetup %s: %w", imagePath, err)
}
r.entries[imagePath] = &loopEntry{device: dev, refcount: 1}
slog.Info("loop device created", "image", imagePath, "device", dev)
return dev, nil
}
// Release decrements the refcount for the given image path.
// When the refcount reaches zero, the loop device is detached.
func (r *LoopRegistry) Release(imagePath string) {
r.mu.Lock()
defer r.mu.Unlock()
e, ok := r.entries[imagePath]
if !ok {
return
}
e.refcount--
if e.refcount <= 0 {
if err := losetupDetach(e.device); err != nil {
slog.Warn("losetup detach failed", "device", e.device, "error", err)
}
delete(r.entries, imagePath)
slog.Info("loop device released", "image", imagePath, "device", e.device)
}
}
// ReleaseAll detaches all loop devices. Used during shutdown.
func (r *LoopRegistry) ReleaseAll() {
r.mu.Lock()
defer r.mu.Unlock()
for path, e := range r.entries {
if err := losetupDetach(e.device); err != nil {
slog.Warn("losetup detach failed", "device", e.device, "error", err)
}
delete(r.entries, path)
}
}
// SnapshotDevice holds the state for a single dm-snapshot device.
type SnapshotDevice struct {
Name string // dm device name, e.g., "wrenn-sb-a1b2c3d4"
DevicePath string // /dev/mapper/<Name>
CowPath string // path to the sparse CoW file
CowLoopDev string // loop device for the CoW file
}
// CreateSnapshot sets up a new dm-snapshot device.
//
// It creates a sparse CoW file, attaches it as a loop device, and creates
// a device-mapper snapshot target combining the read-only origin with the
// writable CoW layer.
//
// The origin loop device must already exist (from LoopRegistry.Acquire).
func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
// Create sparse CoW file sized to match the origin.
if err := createSparseFile(cowPath, originSizeBytes); err != nil {
return nil, fmt.Errorf("create cow file: %w", err)
}
cowLoopDev, err := losetupCreateRW(cowPath)
if err != nil {
os.Remove(cowPath)
return nil, fmt.Errorf("losetup cow: %w", err)
}
sectors := originSizeBytes / 512
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
}
os.Remove(cowPath)
return nil, fmt.Errorf("dmsetup create: %w", err)
}
devPath := "/dev/mapper/" + name
slog.Info("dm-snapshot created",
"name", name,
"device", devPath,
"origin", originLoopDev,
"cow", cowPath,
)
return &SnapshotDevice{
Name: name,
DevicePath: devPath,
CowPath: cowPath,
CowLoopDev: cowLoopDev,
}, nil
}
// RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file.
// The CoW file must have been created with the persistent (P) flag and still
// contain valid dm-snapshot metadata.
func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) {
cowLoopDev, err := losetupCreateRW(cowPath)
if err != nil {
return nil, fmt.Errorf("losetup cow: %w", err)
}
sectors := originSizeBytes / 512
if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil {
if detachErr := losetupDetach(cowLoopDev); detachErr != nil {
slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr)
}
return nil, fmt.Errorf("dmsetup create: %w", err)
}
devPath := "/dev/mapper/" + name
slog.Info("dm-snapshot restored",
"name", name,
"device", devPath,
"origin", originLoopDev,
"cow", cowPath,
)
return &SnapshotDevice{
Name: name,
DevicePath: devPath,
CowPath: cowPath,
CowLoopDev: cowLoopDev,
}, nil
}
// RemoveSnapshot tears down a dm-snapshot device and its CoW loop device.
// The CoW file is NOT deleted — the caller decides whether to keep or remove it.
func RemoveSnapshot(dev *SnapshotDevice) error {
if err := dmsetupRemove(dev.Name); err != nil {
return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err)
}
if err := losetupDetach(dev.CowLoopDev); err != nil {
slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err)
}
slog.Info("dm-snapshot removed", "name", dev.Name)
return nil
}
// FlattenSnapshot reads the full contents of a dm-snapshot device and writes
// it to a new file. This merges the base image + CoW changes into a standalone
// rootfs image suitable for use as a new template.
func FlattenSnapshot(dmDevPath, outputPath string) error {
cmd := exec.Command("dd",
"if="+dmDevPath,
"of="+outputPath,
"bs=4M",
"status=none",
)
if out, err := cmd.CombinedOutput(); err != nil {
os.Remove(outputPath)
return fmt.Errorf("dd flatten: %s: %w", string(out), err)
}
return nil
}
// OriginSizeBytes returns the size in bytes of a loop device's backing file.
func OriginSizeBytes(loopDev string) (int64, error) {
// blockdev --getsize64 returns size in bytes.
out, err := exec.Command("blockdev", "--getsize64", loopDev).CombinedOutput()
if err != nil {
return 0, fmt.Errorf("blockdev --getsize64 %s: %s: %w", loopDev, strings.TrimSpace(string(out)), err)
}
s := strings.TrimSpace(string(out))
return strconv.ParseInt(s, 10, 64)
}
// CleanupStaleDevices removes any device-mapper devices matching the
// "wrenn-" prefix that may have been left behind by a previous agent
// instance that crashed or was killed. Should be called at agent startup.
func CleanupStaleDevices() {
out, err := exec.Command("dmsetup", "ls", "--target", "snapshot").CombinedOutput()
if err != nil {
slog.Debug("dmsetup ls failed (may be normal if no devices exist)", "error", err)
return
}
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
if line == "" || line == "No devices found" {
continue
}
// dmsetup ls output format: "name\t(major:minor)"
name, _, _ := strings.Cut(line, "\t")
if !strings.HasPrefix(name, "wrenn-") {
continue
}
slog.Warn("removing stale dm-snapshot device", "name", name)
if err := dmsetupRemove(name); err != nil {
slog.Warn("failed to remove stale device", "name", name, "error", err)
}
}
}
// --- low-level helpers ---
// losetupCreate attaches a file as a read-only loop device.
func losetupCreate(imagePath string) (string, error) {
out, err := exec.Command("losetup", "--read-only", "--find", "--show", imagePath).Output()
if err != nil {
return "", fmt.Errorf("losetup --read-only: %w", err)
}
return strings.TrimSpace(string(out)), nil
}
// losetupCreateRW attaches a file as a read-write loop device.
func losetupCreateRW(path string) (string, error) {
out, err := exec.Command("losetup", "--find", "--show", path).Output()
if err != nil {
return "", fmt.Errorf("losetup: %w", err)
}
return strings.TrimSpace(string(out)), nil
}
// losetupDetach detaches a loop device.
func losetupDetach(dev string) error {
return exec.Command("losetup", "-d", dev).Run()
}
// dmsetupCreate creates a dm-snapshot device with persistent metadata.
func dmsetupCreate(name, originDev, cowDev string, sectors int64) error {
// Table format: <start> <size> snapshot <origin> <cow> P <chunk_size>
// P = persistent — CoW metadata survives dmsetup remove.
table := fmt.Sprintf("0 %d snapshot %s %s P %d", sectors, originDev, cowDev, ChunkSize)
cmd := exec.Command("dmsetup", "create", name, "--table", table)
if out, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err)
}
return nil
}
// dmsetupRemove removes a device-mapper device.
func dmsetupRemove(name string) error {
cmd := exec.Command("dmsetup", "remove", name)
if out, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err)
}
return nil
}
// createSparseFile creates a sparse file of the given size.
func createSparseFile(path string, sizeBytes int64) error {
f, err := os.Create(path)
if err != nil {
return err
}
if err := f.Truncate(sizeBytes); err != nil {
f.Close()
os.Remove(path)
return err
}
return f.Close()
}

View File

@ -1,16 +0,0 @@
package filesystem
import (
"fmt"
"os/exec"
)
// CloneRootfs creates a copy-on-write clone of the base rootfs image.
// Uses reflink if supported by the filesystem, falls back to regular copy.
func CloneRootfs(src, dst string) error {
cmd := exec.Command("cp", "--reflink=auto", src, dst)
if out, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("cp --reflink=auto: %s: %w", string(out), err)
}
return nil
}

View File

@ -1 +0,0 @@
package filesystem

View File

@ -104,7 +104,7 @@ func CreateNetwork(slot *Slot) error {
return fmt.Errorf("get host namespace: %w", err)
}
defer hostNS.Close()
defer netns.Set(hostNS)
defer func() { _ = netns.Set(hostNS) }()
// Create named network namespace.
ns, err := netns.NewNamed(slot.NamespaceID)
@ -304,17 +304,17 @@ func RemoveNetwork(slot *Slot) error {
// Remove host-side iptables rules (best effort).
if defaultIface != "" {
iptablesHost(
_ = iptablesHost(
"-D", "FORWARD",
"-i", slot.VethName, "-o", defaultIface,
"-j", "ACCEPT",
)
iptablesHost(
_ = iptablesHost(
"-D", "FORWARD",
"-i", defaultIface, "-o", slot.VethName,
"-j", "ACCEPT",
)
iptablesHost(
_ = iptablesHost(
"-t", "nat", "-D", "POSTROUTING",
"-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()),
"-o", defaultIface,
@ -324,18 +324,18 @@ func RemoveNetwork(slot *Slot) error {
// Remove host route.
_, hostNet, _ := net.ParseCIDR(fmt.Sprintf("%s/32", slot.HostIP.String()))
netlink.RouteDel(&netlink.Route{
_ = netlink.RouteDel(&netlink.Route{
Dst: hostNet,
Gw: slot.VpeerIP,
})
// Delete veth (also destroys the peer in the namespace).
if veth, err := netlink.LinkByName(slot.VethName); err == nil {
netlink.LinkDel(veth)
_ = netlink.LinkDel(veth)
}
// Delete the named namespace.
netns.DeleteNamed(slot.NamespaceID)
_ = netns.DeleteNamed(slot.NamespaceID)
slog.Info("network removed", "ns", slot.NamespaceID)

View File

@ -11,8 +11,8 @@ import (
"github.com/google/uuid"
"git.omukk.dev/wrenn/sandbox/internal/devicemapper"
"git.omukk.dev/wrenn/sandbox/internal/envdclient"
"git.omukk.dev/wrenn/sandbox/internal/filesystem"
"git.omukk.dev/wrenn/sandbox/internal/id"
"git.omukk.dev/wrenn/sandbox/internal/models"
"git.omukk.dev/wrenn/sandbox/internal/network"
@ -35,6 +35,7 @@ type Manager struct {
cfg Config
vm *vm.Manager
slots *network.SlotAllocator
loops *devicemapper.LoopRegistry
mu sync.RWMutex
boxes map[string]*sandboxState
stopCh chan struct{}
@ -46,6 +47,8 @@ type sandboxState struct {
slot *network.Slot
client *envdclient.Client
uffdSocketPath string // non-empty for sandboxes restored from snapshot
dmDevice *devicemapper.SnapshotDevice
baseImagePath string // path to the base template rootfs (for loop registry release)
}
// New creates a new sandbox manager.
@ -57,6 +60,7 @@ func New(cfg Config) *Manager {
cfg: cfg,
vm: vm.NewManager(),
slots: network.NewSlotAllocator(),
loops: devicemapper.NewLoopRegistry(),
boxes: make(map[string]*sandboxState),
stopCh: make(chan struct{}),
}
@ -91,16 +95,33 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err)
}
// Clone rootfs.
rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, template))
if err := filesystem.CloneRootfs(baseRootfs, rootfsPath); err != nil {
return nil, fmt.Errorf("clone rootfs: %w", err)
// Acquire shared read-only loop device for the base image.
originLoop, err := m.loops.Acquire(baseRootfs)
if err != nil {
return nil, fmt.Errorf("acquire loop device: %w", err)
}
originSize, err := devicemapper.OriginSizeBytes(originLoop)
if err != nil {
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("get origin size: %w", err)
}
// Create dm-snapshot with per-sandbox CoW file.
dmName := "wrenn-" + sandboxID
cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize)
if err != nil {
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create dm-snapshot: %w", err)
}
// Allocate network slot.
slotIdx, err := m.slots.Allocate()
if err != nil {
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("allocate network slot: %w", err)
}
slot := network.NewSlot(slotIdx)
@ -108,15 +129,17 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
// Set up network.
if err := network.CreateNetwork(slot); err != nil {
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create network: %w", err)
}
// Boot VM.
// Boot VM — Firecracker gets the dm device path.
vmCfg := vm.VMConfig{
SandboxID: sandboxID,
KernelPath: m.cfg.KernelPath,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
VCPUs: vcpus,
MemoryMB: memoryMB,
NetworkNamespace: slot.NamespaceID,
@ -128,9 +151,11 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
}
if _, err := m.vm.Create(ctx, vmCfg); err != nil {
network.RemoveNetwork(slot)
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create VM: %w", err)
}
@ -140,10 +165,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
m.vm.Destroy(context.Background(), sandboxID)
network.RemoveNetwork(slot)
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("wait for envd: %w", err)
}
@ -158,12 +185,14 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
TimeoutSec: timeoutSec,
SlotIndex: slotIdx,
HostIP: slot.HostIP,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
CreatedAt: now,
LastActiveAt: now,
},
slot: slot,
client: client,
slot: slot,
client: client,
dmDevice: dmDev,
baseImagePath: baseRootfs,
}
m.mu.Lock()
@ -174,6 +203,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus,
"id", sandboxID,
"template", template,
"host_ip", slot.HostIP.String(),
"dm_device", dmDev.DevicePath,
)
return &sb.Sandbox, nil
@ -194,7 +224,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
}
// Always clean up pause snapshot files (may exist if sandbox was paused).
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
slog.Info("sandbox destroyed", "id", sandboxID)
return nil
@ -209,7 +239,18 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) {
slog.Warn("network cleanup error", "id", sb.ID, "error", err)
}
m.slots.Release(sb.SlotIndex)
os.Remove(sb.RootfsPath)
// Tear down dm-snapshot and release the base image loop device.
if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil {
slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err)
}
os.Remove(sb.dmDevice.CowPath)
}
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
@ -228,12 +269,15 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
}
pauseStart := time.Now()
// Step 1: Pause the VM (freeze vCPUs).
if err := m.vm.Pause(ctx, sandboxID); err != nil {
return fmt.Errorf("pause VM: %w", err)
}
slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
// Step 2: Take a full snapshot (snapfile + memfile).
// Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately.
if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil {
return fmt.Errorf("create snapshot dir: %w", err)
}
@ -242,39 +286,74 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
rawMemPath := filepath.Join(snapDir, "memfile.raw")
snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID)
// For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to
// serialize memory — this is the main bottleneck on re-pause.
snapshotStart := time.Now()
if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("create VM snapshot: %w", err)
}
slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart))
// Step 3: Process the raw memfile into a compact diff + header.
buildID := uuid.New()
diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID)
headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID)
processStart := time.Now()
if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("process memfile: %w", err)
}
slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart))
// Remove the raw memfile — we only keep the compact diff.
os.Remove(rawMemPath)
// Step 4: Copy rootfs into snapshot dir.
snapshotRootfs := snapshot.RootfsPath(m.cfg.SnapshotsDir, sandboxID)
if err := filesystem.CloneRootfs(sb.RootfsPath, snapshotRootfs); err != nil {
snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)
return fmt.Errorf("copy rootfs: %w", err)
// Step 4: Destroy the VM first so Firecracker releases the dm device.
if err := m.vm.Destroy(ctx, sb.ID); err != nil {
slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err)
}
// Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW.
if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil {
warnErr("dm-snapshot remove error during pause", sandboxID, err)
}
// Move (not copy) the CoW file into the snapshot directory.
snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("move cow file: %w", err)
}
// Record which base template this CoW was built against.
if err := snapshot.WriteMeta(m.cfg.SnapshotsDir, sandboxID, &snapshot.RootfsMeta{
BaseTemplate: sb.baseImagePath,
}); err != nil {
warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID))
return fmt.Errorf("write rootfs meta: %w", err)
}
}
// Step 6: Clean up remaining resources (network, loop device, uffd socket).
if err := network.RemoveNetwork(sb.slot); err != nil {
slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err)
}
m.slots.Release(sb.SlotIndex)
if sb.baseImagePath != "" {
m.loops.Release(sb.baseImagePath)
}
if sb.uffdSocketPath != "" {
os.Remove(sb.uffdSocketPath)
}
// Step 5: Destroy the sandbox (free VM, network, rootfs clone).
m.mu.Lock()
delete(m.boxes, sandboxID)
m.mu.Unlock()
m.cleanup(ctx, sb)
slog.Info("sandbox paused (snapshot + destroy)", "id", sandboxID)
slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart))
return nil
}
@ -307,19 +386,62 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
return nil, fmt.Errorf("create memory source: %w", err)
}
// Clone snapshot rootfs for this sandbox.
snapshotRootfs := snapshot.RootfsPath(snapDir, sandboxID)
rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-resume.ext4", sandboxID))
if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
// Read rootfs metadata to find the base template image.
meta, err := snapshot.ReadMeta(snapDir, sandboxID)
if err != nil {
source.Close()
return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
return nil, fmt.Errorf("read rootfs meta: %w", err)
}
// Acquire the base image loop device and restore dm-snapshot from saved CoW.
baseImagePath := meta.BaseTemplate
originLoop, err := m.loops.Acquire(baseImagePath)
if err != nil {
source.Close()
return nil, fmt.Errorf("acquire loop device: %w", err)
}
originSize, err := devicemapper.OriginSizeBytes(originLoop)
if err != nil {
source.Close()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("get origin size: %w", err)
}
// Move CoW file from snapshot dir to sandboxes dir for the running sandbox.
savedCow := snapshot.CowPath(snapDir, sandboxID)
cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
if err := os.Rename(savedCow, cowPath); err != nil {
source.Close()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("move cow file: %w", err)
}
// rollbackCow attempts to move the CoW file back to the snapshot dir.
// Best-effort — logs a warning if it fails.
rollbackCow := func() {
if err := os.Rename(cowPath, savedCow); err != nil {
slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
}
}
// Restore dm-snapshot from existing persistent CoW file.
dmName := "wrenn-" + sandboxID
dmDev, err := devicemapper.RestoreSnapshot(dmName, originLoop, cowPath, originSize)
if err != nil {
source.Close()
m.loops.Release(baseImagePath)
rollbackCow()
return nil, fmt.Errorf("restore dm-snapshot: %w", err)
}
// Allocate network slot.
slotIdx, err := m.slots.Allocate()
if err != nil {
source.Close()
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("allocate network slot: %w", err)
}
slot := network.NewSlot(slotIdx)
@ -327,7 +449,9 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
if err := network.CreateNetwork(slot); err != nil {
source.Close()
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("create network: %w", err)
}
@ -337,9 +461,11 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
uffdServer := uffd.NewServer(uffdSocketPath, source)
if err := uffdServer.Start(ctx); err != nil {
source.Close()
network.RemoveNetwork(slot)
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("start uffd server: %w", err)
}
@ -347,7 +473,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
vmCfg := vm.VMConfig{
SandboxID: sandboxID,
KernelPath: m.cfg.KernelPath,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
VCPUs: int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot.
MemoryMB: int(header.Metadata.Size / (1024 * 1024)),
NetworkNamespace: slot.NamespaceID,
@ -360,11 +486,13 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
snapPath := snapshot.SnapPath(snapDir, sandboxID)
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
uffdServer.Stop()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
network.RemoveNetwork(slot)
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
@ -374,12 +502,14 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
uffdServer.Stop()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
m.vm.Destroy(context.Background(), sandboxID)
network.RemoveNetwork(slot)
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("wait for envd: %w", err)
}
@ -394,25 +524,29 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
TimeoutSec: 0,
SlotIndex: slotIdx,
HostIP: slot.HostIP,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
CreatedAt: now,
LastActiveAt: now,
},
slot: slot,
client: client,
uffdSocketPath: uffdSocketPath,
dmDevice: dmDev,
baseImagePath: baseImagePath,
}
m.mu.Lock()
m.boxes[sandboxID] = sb
m.mu.Unlock()
// Clean up the snapshot files now that the sandbox is running.
snapshot.Remove(snapDir, sandboxID)
// Clean up remaining snapshot files (snapfile, memfile, header, meta).
// The CoW file was already moved out.
warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID))
slog.Info("sandbox resumed from snapshot",
"id", sandboxID,
"host_ip", slot.HostIP.String(),
"dm_device", dmDev.DevicePath,
)
return &sb.Sandbox, nil
@ -421,7 +555,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
// CreateSnapshot creates a reusable template from a sandbox. Works on both
// running and paused sandboxes. If the sandbox is running, it is paused first.
// The sandbox remains paused after this call (it can still be resumed).
// The template files are copied to ImagesDir/{name}/.
//
// The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4
// so the template has no dependency on the original base image. Memory state
// and VM snapshot files are copied as-is.
func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (int64, error) {
// If the sandbox is running, pause it first.
if _, err := m.get(sandboxID); err == nil {
@ -435,29 +572,73 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i
return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
}
// Copy snapshot files to ImagesDir/{name}/ as a reusable template.
// Create template directory.
if err := snapshot.EnsureDir(m.cfg.ImagesDir, name); err != nil {
return 0, fmt.Errorf("create template dir: %w", err)
}
// Copy VM snapshot and memory files.
srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID)
dstDir := snapshot.DirPath(m.cfg.ImagesDir, name)
for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName, snapshot.RootfsFileName} {
for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} {
src := filepath.Join(srcDir, fname)
dst := filepath.Join(dstDir, fname)
if err := filesystem.CloneRootfs(src, dst); err != nil {
snapshot.Remove(m.cfg.ImagesDir, name)
if err := copyFile(src, dst); err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("copy %s: %w", fname, err)
}
}
// Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image.
meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID)
if err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("read rootfs meta: %w", err)
}
originLoop, err := m.loops.Acquire(meta.BaseTemplate)
if err != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("acquire loop device for flatten: %w", err)
}
originSize, err := devicemapper.OriginSizeBytes(originLoop)
if err != nil {
m.loops.Release(meta.BaseTemplate)
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("get origin size: %w", err)
}
// Temporarily restore the dm-snapshot to read the merged view.
cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID)
tmpDmName := "wrenn-flatten-" + sandboxID
tmpDev, err := devicemapper.RestoreSnapshot(tmpDmName, originLoop, cowPath, originSize)
if err != nil {
m.loops.Release(meta.BaseTemplate)
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err)
}
// Flatten to new standalone rootfs.
flattenedPath := snapshot.RootfsPath(m.cfg.ImagesDir, name)
flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath)
// Always clean up the temporary dm device.
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(tmpDev))
m.loops.Release(meta.BaseTemplate)
if flattenErr != nil {
warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name))
return 0, fmt.Errorf("flatten rootfs: %w", flattenErr)
}
sizeBytes, err := snapshot.DirSize(m.cfg.ImagesDir, name)
if err != nil {
slog.Warn("failed to calculate snapshot size", "error", err)
}
slog.Info("snapshot created",
slog.Info("template snapshot created (rootfs flattened)",
"sandbox", sandboxID,
"name", name,
"size_bytes", sizeBytes,
@ -472,7 +653,9 @@ func (m *Manager) DeleteSnapshot(name string) error {
// createFromSnapshot creates a new sandbox by restoring from a snapshot template
// in ImagesDir/{snapshotName}/. Uses UFFD for lazy memory loading.
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, memoryMB, timeoutSec int) (*models.Sandbox, error) {
// The template's rootfs.ext4 is a flattened standalone image — we create a
// dm-snapshot on top of it just like a normal Create.
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, _, timeoutSec int) (*models.Sandbox, error) {
imagesDir := m.cfg.ImagesDir
// Read the header.
@ -486,10 +669,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
return nil, fmt.Errorf("deserialize header: %w", err)
}
// Snapshot determines memory size. VCPUs are also baked into the
// snapshot state — the caller should pass the correct value from
// the template DB record.
memoryMB = int(header.Metadata.Size / (1024 * 1024))
// Snapshot determines memory size.
memoryMB := int(header.Metadata.Size / (1024 * 1024))
// Build diff file map.
diffPaths := map[string]string{
@ -501,19 +682,37 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
return nil, fmt.Errorf("create memory source: %w", err)
}
// Clone snapshot rootfs.
snapshotRootfs := snapshot.RootfsPath(imagesDir, snapshotName)
rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, snapshotName))
if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil {
// Set up dm-snapshot on the template's flattened rootfs.
baseRootfs := snapshot.RootfsPath(imagesDir, snapshotName)
originLoop, err := m.loops.Acquire(baseRootfs)
if err != nil {
source.Close()
return nil, fmt.Errorf("clone snapshot rootfs: %w", err)
return nil, fmt.Errorf("acquire loop device: %w", err)
}
originSize, err := devicemapper.OriginSizeBytes(originLoop)
if err != nil {
source.Close()
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("get origin size: %w", err)
}
dmName := "wrenn-" + sandboxID
cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID))
dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize)
if err != nil {
source.Close()
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create dm-snapshot: %w", err)
}
// Allocate network.
slotIdx, err := m.slots.Allocate()
if err != nil {
source.Close()
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("allocate network slot: %w", err)
}
slot := network.NewSlot(slotIdx)
@ -521,7 +720,9 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
if err := network.CreateNetwork(slot); err != nil {
source.Close()
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("create network: %w", err)
}
@ -531,9 +732,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
uffdServer := uffd.NewServer(uffdSocketPath, source)
if err := uffdServer.Start(ctx); err != nil {
source.Close()
network.RemoveNetwork(slot)
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("start uffd server: %w", err)
}
@ -541,7 +744,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
vmCfg := vm.VMConfig{
SandboxID: sandboxID,
KernelPath: m.cfg.KernelPath,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
VCPUs: vcpus,
MemoryMB: memoryMB,
NetworkNamespace: slot.NamespaceID,
@ -554,11 +757,13 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
snapPath := snapshot.SnapPath(imagesDir, snapshotName)
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil {
uffdServer.Stop()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
network.RemoveNetwork(slot)
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
@ -568,12 +773,14 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
uffdServer.Stop()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
m.vm.Destroy(context.Background(), sandboxID)
network.RemoveNetwork(slot)
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
os.Remove(rootfsPath)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("wait for envd: %w", err)
}
@ -588,13 +795,15 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
TimeoutSec: timeoutSec,
SlotIndex: slotIdx,
HostIP: slot.HostIP,
RootfsPath: rootfsPath,
RootfsPath: dmDev.DevicePath,
CreatedAt: now,
LastActiveAt: now,
},
slot: slot,
client: client,
uffdSocketPath: uffdSocketPath,
dmDevice: dmDev,
baseImagePath: baseRootfs,
}
m.mu.Lock()
@ -605,6 +814,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam
"id", sandboxID,
"snapshot", snapshotName,
"host_ip", slot.HostIP.String(),
"dm_device", dmDev.DevicePath,
)
return &sb.Sandbox, nil
@ -735,7 +945,7 @@ func (m *Manager) reapExpired(ctx context.Context) {
}
}
// Shutdown destroys all sandboxes and stops the TTL reaper.
// Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper.
func (m *Manager) Shutdown(ctx context.Context) {
close(m.stopCh)
@ -752,4 +962,35 @@ func (m *Manager) Shutdown(ctx context.Context) {
slog.Warn("shutdown destroy failed", "id", sbID, "error", err)
}
}
m.loops.ReleaseAll()
}
// warnErr logs a warning if err is non-nil. Used for best-effort cleanup
// in error paths where the primary error has already been captured.
func warnErr(msg string, id string, err error) {
if err != nil {
slog.Warn(msg, "id", id, "error", err)
}
}
// copyFile copies a regular file from src to dst using streaming I/O.
func copyFile(src, dst string) error {
sf, err := os.Open(src)
if err != nil {
return fmt.Errorf("open %s: %w", src, err)
}
defer sf.Close()
df, err := os.Create(dst)
if err != nil {
return fmt.Errorf("create %s: %w", dst, err)
}
defer df.Close()
if _, err := df.ReadFrom(sf); err != nil {
os.Remove(dst)
return fmt.Errorf("copy %s → %s: %w", src, dst, err)
}
return nil
}

View File

@ -1,10 +1,12 @@
package snapshot
import (
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"syscall"
)
const (
@ -12,6 +14,8 @@ const (
MemDiffName = "memfile"
MemHeaderName = "memfile.header"
RootfsFileName = "rootfs.ext4"
RootfsCowName = "rootfs.cow"
RootfsMetaName = "rootfs.meta"
)
// DirPath returns the snapshot directory for a given name.
@ -39,15 +43,68 @@ func RootfsPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsFileName)
}
// CowPath returns the path to the rootfs CoW diff file.
func CowPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsCowName)
}
// MetaPath returns the path to the rootfs metadata file.
func MetaPath(baseDir, name string) string {
return filepath.Join(DirPath(baseDir, name), RootfsMetaName)
}
// RootfsMeta records which base template a CoW file was created against.
type RootfsMeta struct {
BaseTemplate string `json:"base_template"`
}
// WriteMeta writes rootfs metadata to the snapshot directory.
func WriteMeta(baseDir, name string, meta *RootfsMeta) error {
data, err := json.Marshal(meta)
if err != nil {
return fmt.Errorf("marshal rootfs meta: %w", err)
}
if err := os.WriteFile(MetaPath(baseDir, name), data, 0644); err != nil {
return fmt.Errorf("write rootfs meta: %w", err)
}
return nil
}
// ReadMeta reads rootfs metadata from the snapshot directory.
func ReadMeta(baseDir, name string) (*RootfsMeta, error) {
data, err := os.ReadFile(MetaPath(baseDir, name))
if err != nil {
return nil, fmt.Errorf("read rootfs meta: %w", err)
}
var meta RootfsMeta
if err := json.Unmarshal(data, &meta); err != nil {
return nil, fmt.Errorf("unmarshal rootfs meta: %w", err)
}
return &meta, nil
}
// Exists reports whether a complete snapshot exists (all required files present).
// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots.
func Exists(baseDir, name string) bool {
dir := DirPath(baseDir, name)
for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName, RootfsFileName} {
// Common files required by both formats.
for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} {
if _, err := os.Stat(filepath.Join(dir, f)); err != nil {
return false
}
}
return true
// Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot).
if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil {
return true
}
if _, err := os.Stat(filepath.Join(dir, RootfsCowName)); err == nil {
if _, err := os.Stat(filepath.Join(dir, RootfsMetaName)); err == nil {
return true
}
}
return false
}
// IsTemplate reports whether a template image directory exists (has rootfs.ext4).
@ -61,6 +118,15 @@ func IsSnapshot(baseDir, name string) bool {
return Exists(baseDir, name)
}
// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta)
// as opposed to legacy full rootfs (rootfs.ext4).
func HasCow(baseDir, name string) bool {
dir := DirPath(baseDir, name)
_, cowErr := os.Stat(filepath.Join(dir, RootfsCowName))
_, metaErr := os.Stat(filepath.Join(dir, RootfsMetaName))
return cowErr == nil && metaErr == nil
}
// EnsureDir creates the snapshot directory if it doesn't exist.
func EnsureDir(baseDir, name string) error {
dir := DirPath(baseDir, name)
@ -75,12 +141,14 @@ func Remove(baseDir, name string) error {
return os.RemoveAll(DirPath(baseDir, name))
}
// DirSize returns the total byte size of all files in the snapshot directory.
// DirSize returns the actual disk usage of all files in the snapshot directory.
// Uses block-based accounting (stat.Blocks * 512) so sparse files report only
// the blocks that are actually allocated, not their apparent size.
func DirSize(baseDir, name string) (int64, error) {
var total int64
dir := DirPath(baseDir, name)
err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, err error) error {
err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
@ -91,7 +159,13 @@ func DirSize(baseDir, name string) (int64, error) {
if err != nil {
return err
}
total += info.Size()
if sys, ok := info.Sys().(*syscall.Stat_t); ok {
// Blocks is in 512-byte units regardless of filesystem block size.
total += sys.Blocks * 512
} else {
// Fallback to apparent size if syscall stat is unavailable.
total += info.Size()
}
return nil
})
if err != nil {

View File

@ -115,7 +115,7 @@ func (s *Server) Ready() <-chan struct{} {
// Stop signals the UFFD poll loop to exit and waits for it to finish.
func (s *Server) Stop() error {
// Write a byte to the exit pipe to wake the poll loop.
s.exitW.Write([]byte{0})
_, _ = s.exitW.Write([]byte{0})
<-s.doneCh
return s.doneErr
}

View File

@ -10,8 +10,8 @@ type VMConfig struct {
// KernelPath is the path to the uncompressed Linux kernel (vmlinux).
KernelPath string
// RootfsPath is the path to the ext4 rootfs image for this sandbox.
// This should be a per-sandbox copy (reflink clone of the base image).
// RootfsPath is the path to the rootfs block device for this sandbox.
// Typically a dm-snapshot device (e.g., /dev/mapper/wrenn-sb-a1b2c3d4).
RootfsPath string
// VCPUs is the number of virtual CPUs to allocate (default: 1).

View File

@ -131,15 +131,6 @@ func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string)
})
}
// loadSnapshot loads a VM snapshot from a file-backed memory image.
func (c *fcClient) loadSnapshot(ctx context.Context, snapPath, memPath string) error {
return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{
"snapshot_path": snapPath,
"mem_file_path": memPath,
"resume_vm": false,
})
}
// loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for
// lazy memory loading. Firecracker will connect to the socket and
// send the uffd fd + memory region mappings.

View File

@ -53,7 +53,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
// Step 2: Wait for the API socket to appear.
if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("wait for socket: %w", err)
}
@ -61,13 +61,13 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) {
client := newFCClient(cfg.SocketPath)
if err := configureVM(ctx, client, &cfg); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("configure VM: %w", err)
}
// Step 4: Start the VM.
if err := client.startVM(ctx); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("start VM: %w", err)
}
@ -218,7 +218,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
// Step 2: Wait for the API socket.
if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("wait for socket: %w", err)
}
@ -228,13 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath
// No boot resources are configured — the snapshot carries kernel,
// drive, network, and machine config state.
if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("load snapshot: %w", err)
}
// Step 4: Resume the VM.
if err := client.resumeVM(ctx); err != nil {
proc.stop()
_ = proc.stop()
return nil, fmt.Errorf("resume VM: %w", err)
}