From 63e9132d3822a5a7d6acdfa1d21f0c45c21d9793 Mon Sep 17 00:00:00 2001 From: pptx704 Date: Fri, 13 Mar 2026 08:25:40 +0600 Subject: [PATCH] Add device-mapper snapshots, test UI, fix pause ordering and lint errors - Replace reflink rootfs copy with device-mapper snapshots (shared read-only loop device per base template, per-sandbox sparse CoW file) - Add devicemapper package with create/restore/remove/flatten operations and refcounted LoopRegistry for base image loop devices - Fix pause ordering: destroy VM before removing dm-snapshot to avoid "device busy" error (FC must release the dm device first) - Add test UI at GET /test for sandbox lifecycle management (create, pause, resume, destroy, exec, snapshot create/list/delete) - Fix DirSize to report actual disk usage (stat.Blocks * 512) instead of apparent size, so sparse CoW files report correctly - Add timing logs to pause flow for performance diagnostics - Fix all lint errors across api, network, vm, uffd, and sandbox packages - Remove obsolete internal/filesystem package (replaced by devicemapper) - Update CLAUDE.md with device-mapper architecture documentation --- CLAUDE.md | 9 +- cmd/host-agent/main.go | 4 + go.mod | 5 +- go.sum | 4 - internal/api/handlers_exec.go | 7 +- internal/api/handlers_exec_stream.go | 5 - internal/api/handlers_files.go | 2 +- internal/api/handlers_files_stream.go | 3 +- internal/api/handlers_sandbox.go | 10 +- internal/api/handlers_snapshots.go | 4 +- internal/api/handlers_test_ui.go | 435 ++++++++++++++++++++++++++ internal/api/middleware.go | 2 +- internal/api/server.go | 5 +- internal/devicemapper/devicemapper.go | 317 +++++++++++++++++++ internal/filesystem/clone.go | 16 - internal/filesystem/images.go | 1 - internal/network/setup.go | 14 +- internal/sandbox/manager.go | 403 +++++++++++++++++++----- internal/snapshot/local.go | 84 ++++- internal/uffd/server.go | 2 +- internal/vm/config.go | 4 +- internal/vm/fc.go | 9 - internal/vm/manager.go | 12 +- 23 files changed, 1202 insertions(+), 155 deletions(-) create mode 100644 internal/api/handlers_test_ui.go create mode 100644 internal/devicemapper/devicemapper.go delete mode 100644 internal/filesystem/clone.go delete mode 100644 internal/filesystem/images.go diff --git a/CLAUDE.md b/CLAUDE.md index 91bb6f0..3e29913 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -74,15 +74,15 @@ Startup (`cmd/control-plane/main.go`) wires: config (env vars) → pgxpool → ` ### Host Agent -**Packages:** `internal/hostagent/`, `internal/sandbox/`, `internal/vm/`, `internal/network/`, `internal/filesystem/`, `internal/envdclient/`, `internal/snapshot/` +**Packages:** `internal/hostagent/`, `internal/sandbox/`, `internal/vm/`, `internal/network/`, `internal/devicemapper/`, `internal/envdclient/`, `internal/snapshot/` -Startup (`cmd/host-agent/main.go`) wires: root check → enable IP forwarding → `sandbox.Manager` (containing `vm.Manager` + `network.SlotAllocator`) → `hostagent.Server` (Connect RPC handler) → HTTP server. +Startup (`cmd/host-agent/main.go`) wires: root check → enable IP forwarding → clean up stale dm devices → `sandbox.Manager` (containing `vm.Manager` + `network.SlotAllocator` + `devicemapper.LoopRegistry`) → `hostagent.Server` (Connect RPC handler) → HTTP server. - **RPC Server** (`internal/hostagent/server.go`): implements `hostagentv1connect.HostAgentServiceHandler`. Thin wrapper — every method delegates to `sandbox.Manager`. Maps Connect error codes on return. - **Sandbox Manager** (`internal/sandbox/manager.go`): the core orchestration layer. Maintains in-memory state in `boxes map[string]*sandboxState` (protected by `sync.RWMutex`). Each `sandboxState` holds a `models.Sandbox`, a `*network.Slot`, and an `*envdclient.Client`. Runs a TTL reaper (every 10s) that auto-destroys timed-out sandboxes. - **VM Manager** (`internal/vm/manager.go`, `fc.go`, `config.go`): manages Firecracker processes. Uses raw HTTP API over Unix socket (`/tmp/fc-{sandboxID}.sock`), not the firecracker-go-sdk Machine type. Launches Firecracker via `unshare -m` + `ip netns exec`. Configures VM via PUT to `/boot-source`, `/drives/rootfs`, `/network-interfaces/eth0`, `/machine-config`, then starts with PUT `/actions`. - **Network** (`internal/network/setup.go`, `allocator.go`): per-sandbox network namespace with veth pair + TAP device. See Networking section below. -- **Filesystem** (`internal/filesystem/clone.go`): CoW rootfs clones via `cp --reflink=auto`. +- **Device Mapper** (`internal/devicemapper/devicemapper.go`): CoW rootfs via device-mapper snapshots. Shared read-only loop devices per base template (refcounted `LoopRegistry`), per-sandbox sparse CoW files, dm-snapshot create/restore/remove/flatten operations. - **envd Client** (`internal/envdclient/client.go`, `health.go`): dual interface to the guest agent. Connect RPC for streaming process exec (`process.Start()` bidirectional stream). Plain HTTP for file operations (POST/GET `/files?path=...&username=root`). Health check polls `GET /health` every 100ms until ready (30s timeout). ### envd (Guest Agent) @@ -132,7 +132,7 @@ HIBERNATED → RUNNING (cold snapshot resume, slower) **Sandbox creation** (`POST /v1/sandboxes`): 1. API handler generates sandbox ID, inserts into DB as "pending" 2. RPC `CreateSandbox` → host agent → `sandbox.Manager.Create()` -3. Manager: resolve base rootfs → `cp --reflink` clone → allocate network slot → `CreateNetwork()` (netns + veth + tap + NAT) → `vm.Create()` (start Firecracker, configure via HTTP API, boot) → `envdclient.WaitUntilReady()` (poll /health) → store in-memory state +3. Manager: resolve base rootfs → acquire shared loop device → create dm-snapshot (sparse CoW file) → allocate network slot → `CreateNetwork()` (netns + veth + tap + NAT) → `vm.Create()` (start Firecracker with `/dev/mapper/wrenn-{id}`, configure via HTTP API, boot) → `envdclient.WaitUntilReady()` (poll /health) → store in-memory state 4. API handler updates DB to "running" with host_ip **Command execution** (`POST /v1/sandboxes/{id}/exec`): @@ -181,6 +181,7 @@ To add a new query: add it to the appropriate `.sql` file in `db/queries/` → ` - **Buf + protoc-gen-connect-go** for code generation (not protoc-gen-go-grpc) - **Raw Firecracker HTTP API** via Unix socket (not firecracker-go-sdk Machine type) - **TAP networking** (not vsock) for host-to-envd communication +- **Device-mapper snapshots** for rootfs CoW — shared read-only loop device per base template, per-sandbox sparse CoW file, Firecracker gets `/dev/mapper/wrenn-{id}` - **PostgreSQL** via pgx/v5 + sqlc (type-safe query generation). Goose for migrations (plain SQL, up/down) - **Admin UI**: htmx + Go html/template + chi router. No SPA, no React, no build step - **Lago** for billing (external service, not in this codebase) diff --git a/cmd/host-agent/main.go b/cmd/host-agent/main.go index a204517..c31a8cf 100644 --- a/cmd/host-agent/main.go +++ b/cmd/host-agent/main.go @@ -9,6 +9,7 @@ import ( "syscall" "time" + "git.omukk.dev/wrenn/sandbox/internal/devicemapper" "git.omukk.dev/wrenn/sandbox/internal/hostagent" "git.omukk.dev/wrenn/sandbox/internal/sandbox" "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect" @@ -29,6 +30,9 @@ func main() { slog.Warn("failed to enable ip_forward", "error", err) } + // Clean up any stale dm-snapshot devices from a previous crash. + devicemapper.CleanupStaleDevices() + listenAddr := envOrDefault("AGENT_LISTEN_ADDR", ":50051") kernelPath := envOrDefault("AGENT_KERNEL_PATH", "/var/lib/wrenn/kernels/vmlinux") imagesPath := envOrDefault("AGENT_IMAGES_PATH", "/var/lib/wrenn/images") diff --git a/go.mod b/go.mod index eaa914d..11c8bcb 100644 --- a/go.mod +++ b/go.mod @@ -5,20 +5,19 @@ go 1.25.0 require ( connectrpc.com/connect v1.19.1 github.com/go-chi/chi/v5 v5.2.5 + github.com/google/uuid v1.6.0 github.com/gorilla/websocket v1.5.3 github.com/jackc/pgx/v5 v5.8.0 github.com/vishvananda/netlink v1.1.1-0.20210330154013-f5de75959ad5 github.com/vishvananda/netns v0.0.0-20210104183010-2eb08e3e575f + golang.org/x/sys v0.42.0 google.golang.org/protobuf v1.36.11 ) require ( - github.com/bits-and-blooms/bitset v1.24.4 // indirect - github.com/google/uuid v1.6.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect golang.org/x/sync v0.17.0 // indirect - golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.29.0 // indirect ) diff --git a/go.sum b/go.sum index 703caad..4997587 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ connectrpc.com/connect v1.19.1 h1:R5M57z05+90EfEvCY1b7hBxDVOUl45PrtXtAV2fOC14= connectrpc.com/connect v1.19.1/go.mod h1:tN20fjdGlewnSFeZxLKb0xwIZ6ozc3OQs2hTXy4du9w= -github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE= -github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -37,8 +35,6 @@ golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= golang.org/x/sync v0.17.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20200217220822-9197077df867/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= -golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= diff --git a/internal/api/handlers_exec.go b/internal/api/handlers_exec.go index 18aa723..8323df4 100644 --- a/internal/api/handlers_exec.go +++ b/internal/api/handlers_exec.go @@ -3,6 +3,7 @@ package api import ( "encoding/base64" "encoding/json" + "log/slog" "net/http" "time" "unicode/utf8" @@ -85,13 +86,15 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) { duration := time.Since(start) // Update last active. - h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{ + if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{ ID: sandboxID, LastActiveAt: pgtype.Timestamptz{ Time: time.Now(), Valid: true, }, - }) + }); err != nil { + slog.Warn("failed to update last_active_at", "id", sandboxID, "error", err) + } // Use base64 encoding if output contains non-UTF-8 bytes. stdout := resp.Msg.Stdout diff --git a/internal/api/handlers_exec_stream.go b/internal/api/handlers_exec_stream.go index def933f..a2be27d 100644 --- a/internal/api/handlers_exec_stream.go +++ b/internal/api/handlers_exec_stream.go @@ -37,11 +37,6 @@ type wsStartMsg struct { Args []string `json:"args"` } -// wsStopMsg is sent by the client to stop the process. -type wsStopMsg struct { - Type string `json:"type"` // "stop" -} - // wsOutMsg is sent by the server for process events. type wsOutMsg struct { Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error" diff --git a/internal/api/handlers_files.go b/internal/api/handlers_files.go index d844d08..71a3aea 100644 --- a/internal/api/handlers_files.go +++ b/internal/api/handlers_files.go @@ -128,5 +128,5 @@ func (h *filesHandler) Download(w http.ResponseWriter, r *http.Request) { } w.Header().Set("Content-Type", "application/octet-stream") - w.Write(resp.Msg.Content) + _, _ = w.Write(resp.Msg.Content) } diff --git a/internal/api/handlers_files_stream.go b/internal/api/handlers_files_stream.go index 999bbd5..7999a2f 100644 --- a/internal/api/handlers_files_stream.go +++ b/internal/api/handlers_files_stream.go @@ -2,6 +2,7 @@ package api import ( "io" + "log/slog" "mime" "mime/multipart" "net/http" @@ -189,6 +190,6 @@ func (h *filesStreamHandler) StreamDownload(w http.ResponseWriter, r *http.Reque if err := stream.Err(); err != nil { // Headers already sent, nothing we can do but log. - // The client will see a truncated response. + slog.Warn("file stream error after headers sent", "error", err) } } diff --git a/internal/api/handlers_sandbox.go b/internal/api/handlers_sandbox.go index 6965ef1..aa4ed07 100644 --- a/internal/api/handlers_sandbox.go +++ b/internal/api/handlers_sandbox.go @@ -111,7 +111,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) { sandboxID := id.NewSandboxID() // Insert pending record. - sb, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{ + _, err := h.db.InsertSandbox(ctx, db.InsertSandboxParams{ ID: sandboxID, OwnerID: "", HostID: "default", @@ -136,9 +136,11 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) { TimeoutSec: req.TimeoutSec, })) if err != nil { - h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{ + if _, dbErr := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{ ID: sandboxID, Status: "error", - }) + }); dbErr != nil { + slog.Warn("failed to update sandbox status to error", "id", sandboxID, "error", dbErr) + } status, code, msg := agentErrToHTTP(err) writeError(w, status, code, msg) return @@ -146,7 +148,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) { // Update to running. now := time.Now() - sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{ + sb, err := h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{ ID: sandboxID, HostIp: resp.Msg.HostIp, GuestIp: "", diff --git a/internal/api/handlers_snapshots.go b/internal/api/handlers_snapshots.go index 7de1643..d07fee9 100644 --- a/internal/api/handlers_snapshots.go +++ b/internal/api/handlers_snapshots.go @@ -84,7 +84,9 @@ func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) { return } // Delete existing template record and files. - h.db.DeleteTemplate(ctx, req.Name) + if err := h.db.DeleteTemplate(ctx, req.Name); err != nil { + slog.Warn("failed to delete existing template", "name", req.Name, "error", err) + } } // Verify sandbox exists and is running or paused. diff --git a/internal/api/handlers_test_ui.go b/internal/api/handlers_test_ui.go new file mode 100644 index 0000000..7ac3c89 --- /dev/null +++ b/internal/api/handlers_test_ui.go @@ -0,0 +1,435 @@ +package api + +import ( + "fmt" + "net/http" +) + +func serveTestUI(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/html; charset=utf-8") + fmt.Fprint(w, testUIHTML) +} + +const testUIHTML = ` + + + + +Wrenn Sandbox — Test Console + + + + +

Wrenn Sandbox Test Console

+ +
+ +
+

Create Sandbox

+ + + + + + + + +
+ +
+
+ + +
+

Create Snapshot

+ + + + +
+ + +
+ +

Snapshots / Templates

+
+ +
+
+
+ + +
+

Execute Command

+ + + + + + +
+ +
+ +
+ + +
+

Activity Log

+
+
+ + +
+

Sandboxes

+
+ + +
+
+
+
+ + + +` diff --git a/internal/api/middleware.go b/internal/api/middleware.go index 3e132e8..00cf1c9 100644 --- a/internal/api/middleware.go +++ b/internal/api/middleware.go @@ -24,7 +24,7 @@ type errorDetail struct { func writeJSON(w http.ResponseWriter, status int, v any) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(status) - json.NewEncoder(w).Encode(v) + _ = json.NewEncoder(w).Encode(v) } func writeError(w http.ResponseWriter, status int, code, message string) { diff --git a/internal/api/server.go b/internal/api/server.go index c6afcb7..78dc26b 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -35,6 +35,9 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) * r.Get("/openapi.yaml", serveOpenAPI) r.Get("/docs", serveDocs) + // Test UI for sandbox lifecycle management. + r.Get("/test", serveTestUI) + // Sandbox CRUD. r.Route("/v1/sandboxes", func(r chi.Router) { r.Post("/", sandbox.Create) @@ -71,7 +74,7 @@ func (s *Server) Handler() http.Handler { func serveOpenAPI(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/yaml") - w.Write(openapiYAML) + _, _ = w.Write(openapiYAML) } func serveDocs(w http.ResponseWriter, r *http.Request) { diff --git a/internal/devicemapper/devicemapper.go b/internal/devicemapper/devicemapper.go new file mode 100644 index 0000000..78801d3 --- /dev/null +++ b/internal/devicemapper/devicemapper.go @@ -0,0 +1,317 @@ +// Package devicemapper provides device-mapper snapshot operations for +// copy-on-write rootfs management. Each sandbox gets a dm-snapshot backed +// by a shared read-only loop device (the base template image) and a +// per-sandbox sparse CoW file that stores only modified blocks. +package devicemapper + +import ( + "fmt" + "log/slog" + "os" + "os/exec" + "strconv" + "strings" + "sync" +) + +const ( + // ChunkSize is the dm-snapshot chunk size in 512-byte sectors. + // 8 sectors = 4KB, matching the standard page/block size. + ChunkSize = 8 +) + +// loopEntry tracks a loop device and its reference count. +type loopEntry struct { + device string // e.g., /dev/loop0 + refcount int +} + +// LoopRegistry manages loop devices for base template images. +// Each unique image path gets one read-only loop device, shared +// across all sandboxes using that template. Reference counting +// ensures the loop device is released when no sandboxes use it. +type LoopRegistry struct { + mu sync.Mutex + entries map[string]*loopEntry // imagePath → loopEntry +} + +// NewLoopRegistry creates a new loop device registry. +func NewLoopRegistry() *LoopRegistry { + return &LoopRegistry{ + entries: make(map[string]*loopEntry), + } +} + +// Acquire returns a read-only loop device for the given image path. +// If one already exists, its refcount is incremented. Otherwise a new +// loop device is created via losetup. +func (r *LoopRegistry) Acquire(imagePath string) (string, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if e, ok := r.entries[imagePath]; ok { + e.refcount++ + slog.Debug("loop device reused", "image", imagePath, "device", e.device, "refcount", e.refcount) + return e.device, nil + } + + dev, err := losetupCreate(imagePath) + if err != nil { + return "", fmt.Errorf("losetup %s: %w", imagePath, err) + } + + r.entries[imagePath] = &loopEntry{device: dev, refcount: 1} + slog.Info("loop device created", "image", imagePath, "device", dev) + return dev, nil +} + +// Release decrements the refcount for the given image path. +// When the refcount reaches zero, the loop device is detached. +func (r *LoopRegistry) Release(imagePath string) { + r.mu.Lock() + defer r.mu.Unlock() + + e, ok := r.entries[imagePath] + if !ok { + return + } + + e.refcount-- + if e.refcount <= 0 { + if err := losetupDetach(e.device); err != nil { + slog.Warn("losetup detach failed", "device", e.device, "error", err) + } + delete(r.entries, imagePath) + slog.Info("loop device released", "image", imagePath, "device", e.device) + } +} + +// ReleaseAll detaches all loop devices. Used during shutdown. +func (r *LoopRegistry) ReleaseAll() { + r.mu.Lock() + defer r.mu.Unlock() + + for path, e := range r.entries { + if err := losetupDetach(e.device); err != nil { + slog.Warn("losetup detach failed", "device", e.device, "error", err) + } + delete(r.entries, path) + } +} + +// SnapshotDevice holds the state for a single dm-snapshot device. +type SnapshotDevice struct { + Name string // dm device name, e.g., "wrenn-sb-a1b2c3d4" + DevicePath string // /dev/mapper/ + CowPath string // path to the sparse CoW file + CowLoopDev string // loop device for the CoW file +} + +// CreateSnapshot sets up a new dm-snapshot device. +// +// It creates a sparse CoW file, attaches it as a loop device, and creates +// a device-mapper snapshot target combining the read-only origin with the +// writable CoW layer. +// +// The origin loop device must already exist (from LoopRegistry.Acquire). +func CreateSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { + // Create sparse CoW file sized to match the origin. + if err := createSparseFile(cowPath, originSizeBytes); err != nil { + return nil, fmt.Errorf("create cow file: %w", err) + } + + cowLoopDev, err := losetupCreateRW(cowPath) + if err != nil { + os.Remove(cowPath) + return nil, fmt.Errorf("losetup cow: %w", err) + } + + sectors := originSizeBytes / 512 + if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { + if detachErr := losetupDetach(cowLoopDev); detachErr != nil { + slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) + } + os.Remove(cowPath) + return nil, fmt.Errorf("dmsetup create: %w", err) + } + + devPath := "/dev/mapper/" + name + + slog.Info("dm-snapshot created", + "name", name, + "device", devPath, + "origin", originLoopDev, + "cow", cowPath, + ) + + return &SnapshotDevice{ + Name: name, + DevicePath: devPath, + CowPath: cowPath, + CowLoopDev: cowLoopDev, + }, nil +} + +// RestoreSnapshot re-attaches a dm-snapshot from an existing persistent CoW file. +// The CoW file must have been created with the persistent (P) flag and still +// contain valid dm-snapshot metadata. +func RestoreSnapshot(name, originLoopDev, cowPath string, originSizeBytes int64) (*SnapshotDevice, error) { + cowLoopDev, err := losetupCreateRW(cowPath) + if err != nil { + return nil, fmt.Errorf("losetup cow: %w", err) + } + + sectors := originSizeBytes / 512 + if err := dmsetupCreate(name, originLoopDev, cowLoopDev, sectors); err != nil { + if detachErr := losetupDetach(cowLoopDev); detachErr != nil { + slog.Warn("cow losetup detach failed during cleanup", "device", cowLoopDev, "error", detachErr) + } + return nil, fmt.Errorf("dmsetup create: %w", err) + } + + devPath := "/dev/mapper/" + name + + slog.Info("dm-snapshot restored", + "name", name, + "device", devPath, + "origin", originLoopDev, + "cow", cowPath, + ) + + return &SnapshotDevice{ + Name: name, + DevicePath: devPath, + CowPath: cowPath, + CowLoopDev: cowLoopDev, + }, nil +} + +// RemoveSnapshot tears down a dm-snapshot device and its CoW loop device. +// The CoW file is NOT deleted — the caller decides whether to keep or remove it. +func RemoveSnapshot(dev *SnapshotDevice) error { + if err := dmsetupRemove(dev.Name); err != nil { + return fmt.Errorf("dmsetup remove %s: %w", dev.Name, err) + } + + if err := losetupDetach(dev.CowLoopDev); err != nil { + slog.Warn("cow losetup detach failed", "device", dev.CowLoopDev, "error", err) + } + + slog.Info("dm-snapshot removed", "name", dev.Name) + return nil +} + +// FlattenSnapshot reads the full contents of a dm-snapshot device and writes +// it to a new file. This merges the base image + CoW changes into a standalone +// rootfs image suitable for use as a new template. +func FlattenSnapshot(dmDevPath, outputPath string) error { + cmd := exec.Command("dd", + "if="+dmDevPath, + "of="+outputPath, + "bs=4M", + "status=none", + ) + if out, err := cmd.CombinedOutput(); err != nil { + os.Remove(outputPath) + return fmt.Errorf("dd flatten: %s: %w", string(out), err) + } + return nil +} + +// OriginSizeBytes returns the size in bytes of a loop device's backing file. +func OriginSizeBytes(loopDev string) (int64, error) { + // blockdev --getsize64 returns size in bytes. + out, err := exec.Command("blockdev", "--getsize64", loopDev).CombinedOutput() + if err != nil { + return 0, fmt.Errorf("blockdev --getsize64 %s: %s: %w", loopDev, strings.TrimSpace(string(out)), err) + } + s := strings.TrimSpace(string(out)) + return strconv.ParseInt(s, 10, 64) +} + +// CleanupStaleDevices removes any device-mapper devices matching the +// "wrenn-" prefix that may have been left behind by a previous agent +// instance that crashed or was killed. Should be called at agent startup. +func CleanupStaleDevices() { + out, err := exec.Command("dmsetup", "ls", "--target", "snapshot").CombinedOutput() + if err != nil { + slog.Debug("dmsetup ls failed (may be normal if no devices exist)", "error", err) + return + } + + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + if line == "" || line == "No devices found" { + continue + } + // dmsetup ls output format: "name\t(major:minor)" + name, _, _ := strings.Cut(line, "\t") + if !strings.HasPrefix(name, "wrenn-") { + continue + } + + slog.Warn("removing stale dm-snapshot device", "name", name) + if err := dmsetupRemove(name); err != nil { + slog.Warn("failed to remove stale device", "name", name, "error", err) + } + } +} + +// --- low-level helpers --- + +// losetupCreate attaches a file as a read-only loop device. +func losetupCreate(imagePath string) (string, error) { + out, err := exec.Command("losetup", "--read-only", "--find", "--show", imagePath).Output() + if err != nil { + return "", fmt.Errorf("losetup --read-only: %w", err) + } + return strings.TrimSpace(string(out)), nil +} + +// losetupCreateRW attaches a file as a read-write loop device. +func losetupCreateRW(path string) (string, error) { + out, err := exec.Command("losetup", "--find", "--show", path).Output() + if err != nil { + return "", fmt.Errorf("losetup: %w", err) + } + return strings.TrimSpace(string(out)), nil +} + +// losetupDetach detaches a loop device. +func losetupDetach(dev string) error { + return exec.Command("losetup", "-d", dev).Run() +} + +// dmsetupCreate creates a dm-snapshot device with persistent metadata. +func dmsetupCreate(name, originDev, cowDev string, sectors int64) error { + // Table format: snapshot P + // P = persistent — CoW metadata survives dmsetup remove. + table := fmt.Sprintf("0 %d snapshot %s %s P %d", sectors, originDev, cowDev, ChunkSize) + cmd := exec.Command("dmsetup", "create", name, "--table", table) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err) + } + return nil +} + +// dmsetupRemove removes a device-mapper device. +func dmsetupRemove(name string) error { + cmd := exec.Command("dmsetup", "remove", name) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("%s: %w", strings.TrimSpace(string(out)), err) + } + return nil +} + +// createSparseFile creates a sparse file of the given size. +func createSparseFile(path string, sizeBytes int64) error { + f, err := os.Create(path) + if err != nil { + return err + } + if err := f.Truncate(sizeBytes); err != nil { + f.Close() + os.Remove(path) + return err + } + return f.Close() +} diff --git a/internal/filesystem/clone.go b/internal/filesystem/clone.go deleted file mode 100644 index 28abedb..0000000 --- a/internal/filesystem/clone.go +++ /dev/null @@ -1,16 +0,0 @@ -package filesystem - -import ( - "fmt" - "os/exec" -) - -// CloneRootfs creates a copy-on-write clone of the base rootfs image. -// Uses reflink if supported by the filesystem, falls back to regular copy. -func CloneRootfs(src, dst string) error { - cmd := exec.Command("cp", "--reflink=auto", src, dst) - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("cp --reflink=auto: %s: %w", string(out), err) - } - return nil -} diff --git a/internal/filesystem/images.go b/internal/filesystem/images.go deleted file mode 100644 index e4b5af9..0000000 --- a/internal/filesystem/images.go +++ /dev/null @@ -1 +0,0 @@ -package filesystem diff --git a/internal/network/setup.go b/internal/network/setup.go index 60212d9..933d68e 100644 --- a/internal/network/setup.go +++ b/internal/network/setup.go @@ -104,7 +104,7 @@ func CreateNetwork(slot *Slot) error { return fmt.Errorf("get host namespace: %w", err) } defer hostNS.Close() - defer netns.Set(hostNS) + defer func() { _ = netns.Set(hostNS) }() // Create named network namespace. ns, err := netns.NewNamed(slot.NamespaceID) @@ -304,17 +304,17 @@ func RemoveNetwork(slot *Slot) error { // Remove host-side iptables rules (best effort). if defaultIface != "" { - iptablesHost( + _ = iptablesHost( "-D", "FORWARD", "-i", slot.VethName, "-o", defaultIface, "-j", "ACCEPT", ) - iptablesHost( + _ = iptablesHost( "-D", "FORWARD", "-i", defaultIface, "-o", slot.VethName, "-j", "ACCEPT", ) - iptablesHost( + _ = iptablesHost( "-t", "nat", "-D", "POSTROUTING", "-s", fmt.Sprintf("%s/32", slot.VpeerIP.String()), "-o", defaultIface, @@ -324,18 +324,18 @@ func RemoveNetwork(slot *Slot) error { // Remove host route. _, hostNet, _ := net.ParseCIDR(fmt.Sprintf("%s/32", slot.HostIP.String())) - netlink.RouteDel(&netlink.Route{ + _ = netlink.RouteDel(&netlink.Route{ Dst: hostNet, Gw: slot.VpeerIP, }) // Delete veth (also destroys the peer in the namespace). if veth, err := netlink.LinkByName(slot.VethName); err == nil { - netlink.LinkDel(veth) + _ = netlink.LinkDel(veth) } // Delete the named namespace. - netns.DeleteNamed(slot.NamespaceID) + _ = netns.DeleteNamed(slot.NamespaceID) slog.Info("network removed", "ns", slot.NamespaceID) diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index 1b2c0ee..9ea7865 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -11,8 +11,8 @@ import ( "github.com/google/uuid" + "git.omukk.dev/wrenn/sandbox/internal/devicemapper" "git.omukk.dev/wrenn/sandbox/internal/envdclient" - "git.omukk.dev/wrenn/sandbox/internal/filesystem" "git.omukk.dev/wrenn/sandbox/internal/id" "git.omukk.dev/wrenn/sandbox/internal/models" "git.omukk.dev/wrenn/sandbox/internal/network" @@ -35,6 +35,7 @@ type Manager struct { cfg Config vm *vm.Manager slots *network.SlotAllocator + loops *devicemapper.LoopRegistry mu sync.RWMutex boxes map[string]*sandboxState stopCh chan struct{} @@ -46,6 +47,8 @@ type sandboxState struct { slot *network.Slot client *envdclient.Client uffdSocketPath string // non-empty for sandboxes restored from snapshot + dmDevice *devicemapper.SnapshotDevice + baseImagePath string // path to the base template rootfs (for loop registry release) } // New creates a new sandbox manager. @@ -57,6 +60,7 @@ func New(cfg Config) *Manager { cfg: cfg, vm: vm.NewManager(), slots: network.NewSlotAllocator(), + loops: devicemapper.NewLoopRegistry(), boxes: make(map[string]*sandboxState), stopCh: make(chan struct{}), } @@ -91,16 +95,33 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, return nil, fmt.Errorf("base rootfs not found at %s: %w", baseRootfs, err) } - // Clone rootfs. - rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, template)) - if err := filesystem.CloneRootfs(baseRootfs, rootfsPath); err != nil { - return nil, fmt.Errorf("clone rootfs: %w", err) + // Acquire shared read-only loop device for the base image. + originLoop, err := m.loops.Acquire(baseRootfs) + if err != nil { + return nil, fmt.Errorf("acquire loop device: %w", err) + } + + originSize, err := devicemapper.OriginSizeBytes(originLoop) + if err != nil { + m.loops.Release(baseRootfs) + return nil, fmt.Errorf("get origin size: %w", err) + } + + // Create dm-snapshot with per-sandbox CoW file. + dmName := "wrenn-" + sandboxID + cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID)) + dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize) + if err != nil { + m.loops.Release(baseRootfs) + return nil, fmt.Errorf("create dm-snapshot: %w", err) } // Allocate network slot. slotIdx, err := m.slots.Allocate() if err != nil { - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("allocate network slot: %w", err) } slot := network.NewSlot(slotIdx) @@ -108,15 +129,17 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, // Set up network. if err := network.CreateNetwork(slot); err != nil { m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("create network: %w", err) } - // Boot VM. + // Boot VM — Firecracker gets the dm device path. vmCfg := vm.VMConfig{ SandboxID: sandboxID, KernelPath: m.cfg.KernelPath, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, VCPUs: vcpus, MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, @@ -128,9 +151,11 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, } if _, err := m.vm.Create(ctx, vmCfg); err != nil { - network.RemoveNetwork(slot) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("create VM: %w", err) } @@ -140,10 +165,12 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, defer waitCancel() if err := client.WaitUntilReady(waitCtx); err != nil { - m.vm.Destroy(context.Background(), sandboxID) - network.RemoveNetwork(slot) + warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("wait for envd: %w", err) } @@ -158,12 +185,14 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, }, - slot: slot, - client: client, + slot: slot, + client: client, + dmDevice: dmDev, + baseImagePath: baseRootfs, } m.mu.Lock() @@ -174,6 +203,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID, template string, vcpus, "id", sandboxID, "template", template, "host_ip", slot.HostIP.String(), + "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, nil @@ -194,7 +224,7 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error { } // Always clean up pause snapshot files (may exist if sandbox was paused). - snapshot.Remove(m.cfg.SnapshotsDir, sandboxID) + warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) slog.Info("sandbox destroyed", "id", sandboxID) return nil @@ -209,7 +239,18 @@ func (m *Manager) cleanup(ctx context.Context, sb *sandboxState) { slog.Warn("network cleanup error", "id", sb.ID, "error", err) } m.slots.Release(sb.SlotIndex) - os.Remove(sb.RootfsPath) + + // Tear down dm-snapshot and release the base image loop device. + if sb.dmDevice != nil { + if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { + slog.Warn("dm-snapshot remove error", "id", sb.ID, "error", err) + } + os.Remove(sb.dmDevice.CowPath) + } + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + if sb.uffdSocketPath != "" { os.Remove(sb.uffdSocketPath) } @@ -228,12 +269,15 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status) } + pauseStart := time.Now() + // Step 1: Pause the VM (freeze vCPUs). if err := m.vm.Pause(ctx, sandboxID); err != nil { return fmt.Errorf("pause VM: %w", err) } + slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart)) - // Step 2: Take a full snapshot (snapfile + memfile). + // Step 2: Take VM state snapshot (snapfile + memfile) — CoW file is saved separately. if err := snapshot.EnsureDir(m.cfg.SnapshotsDir, sandboxID); err != nil { return fmt.Errorf("create snapshot dir: %w", err) } @@ -242,39 +286,74 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { rawMemPath := filepath.Join(snapDir, "memfile.raw") snapPath := snapshot.SnapPath(m.cfg.SnapshotsDir, sandboxID) + // For UFFD-resumed sandboxes, FC must fault in ALL lazy-loaded pages to + // serialize memory — this is the main bottleneck on re-pause. + snapshotStart := time.Now() if err := m.vm.Snapshot(ctx, sandboxID, snapPath, rawMemPath); err != nil { - snapshot.Remove(m.cfg.SnapshotsDir, sandboxID) + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) return fmt.Errorf("create VM snapshot: %w", err) } + slog.Debug("pause: FC snapshot created", "id", sandboxID, "elapsed", time.Since(snapshotStart)) // Step 3: Process the raw memfile into a compact diff + header. buildID := uuid.New() diffPath := snapshot.MemDiffPath(m.cfg.SnapshotsDir, sandboxID) headerPath := snapshot.MemHeaderPath(m.cfg.SnapshotsDir, sandboxID) + processStart := time.Now() if _, err := snapshot.ProcessMemfile(rawMemPath, diffPath, headerPath, buildID); err != nil { - snapshot.Remove(m.cfg.SnapshotsDir, sandboxID) + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) return fmt.Errorf("process memfile: %w", err) } + slog.Debug("pause: memfile processed", "id", sandboxID, "elapsed", time.Since(processStart)) // Remove the raw memfile — we only keep the compact diff. os.Remove(rawMemPath) - // Step 4: Copy rootfs into snapshot dir. - snapshotRootfs := snapshot.RootfsPath(m.cfg.SnapshotsDir, sandboxID) - if err := filesystem.CloneRootfs(sb.RootfsPath, snapshotRootfs); err != nil { - snapshot.Remove(m.cfg.SnapshotsDir, sandboxID) - return fmt.Errorf("copy rootfs: %w", err) + // Step 4: Destroy the VM first so Firecracker releases the dm device. + if err := m.vm.Destroy(ctx, sb.ID); err != nil { + slog.Warn("vm destroy error during pause", "id", sb.ID, "error", err) + } + + // Step 5: Now that FC is gone, safely remove the dm-snapshot and save the CoW. + if sb.dmDevice != nil { + if err := devicemapper.RemoveSnapshot(sb.dmDevice); err != nil { + warnErr("dm-snapshot remove error during pause", sandboxID, err) + } + + // Move (not copy) the CoW file into the snapshot directory. + snapshotCow := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) + if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil { + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + return fmt.Errorf("move cow file: %w", err) + } + + // Record which base template this CoW was built against. + if err := snapshot.WriteMeta(m.cfg.SnapshotsDir, sandboxID, &snapshot.RootfsMeta{ + BaseTemplate: sb.baseImagePath, + }); err != nil { + warnErr("snapshot dir cleanup error", sandboxID, snapshot.Remove(m.cfg.SnapshotsDir, sandboxID)) + return fmt.Errorf("write rootfs meta: %w", err) + } + } + + // Step 6: Clean up remaining resources (network, loop device, uffd socket). + if err := network.RemoveNetwork(sb.slot); err != nil { + slog.Warn("network cleanup error during pause", "id", sb.ID, "error", err) + } + m.slots.Release(sb.SlotIndex) + if sb.baseImagePath != "" { + m.loops.Release(sb.baseImagePath) + } + if sb.uffdSocketPath != "" { + os.Remove(sb.uffdSocketPath) } - // Step 5: Destroy the sandbox (free VM, network, rootfs clone). m.mu.Lock() delete(m.boxes, sandboxID) m.mu.Unlock() - m.cleanup(ctx, sb) - - slog.Info("sandbox paused (snapshot + destroy)", "id", sandboxID) + slog.Info("sandbox paused", "id", sandboxID, "total_elapsed", time.Since(pauseStart)) return nil } @@ -307,19 +386,62 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox return nil, fmt.Errorf("create memory source: %w", err) } - // Clone snapshot rootfs for this sandbox. - snapshotRootfs := snapshot.RootfsPath(snapDir, sandboxID) - rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-resume.ext4", sandboxID)) - if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil { + // Read rootfs metadata to find the base template image. + meta, err := snapshot.ReadMeta(snapDir, sandboxID) + if err != nil { source.Close() - return nil, fmt.Errorf("clone snapshot rootfs: %w", err) + return nil, fmt.Errorf("read rootfs meta: %w", err) + } + + // Acquire the base image loop device and restore dm-snapshot from saved CoW. + baseImagePath := meta.BaseTemplate + originLoop, err := m.loops.Acquire(baseImagePath) + if err != nil { + source.Close() + return nil, fmt.Errorf("acquire loop device: %w", err) + } + + originSize, err := devicemapper.OriginSizeBytes(originLoop) + if err != nil { + source.Close() + m.loops.Release(baseImagePath) + return nil, fmt.Errorf("get origin size: %w", err) + } + + // Move CoW file from snapshot dir to sandboxes dir for the running sandbox. + savedCow := snapshot.CowPath(snapDir, sandboxID) + cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID)) + if err := os.Rename(savedCow, cowPath); err != nil { + source.Close() + m.loops.Release(baseImagePath) + return nil, fmt.Errorf("move cow file: %w", err) + } + + // rollbackCow attempts to move the CoW file back to the snapshot dir. + // Best-effort — logs a warning if it fails. + rollbackCow := func() { + if err := os.Rename(cowPath, savedCow); err != nil { + slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err) + } + } + + // Restore dm-snapshot from existing persistent CoW file. + dmName := "wrenn-" + sandboxID + dmDev, err := devicemapper.RestoreSnapshot(dmName, originLoop, cowPath, originSize) + if err != nil { + source.Close() + m.loops.Release(baseImagePath) + rollbackCow() + return nil, fmt.Errorf("restore dm-snapshot: %w", err) } // Allocate network slot. slotIdx, err := m.slots.Allocate() if err != nil { source.Close() - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + rollbackCow() + m.loops.Release(baseImagePath) return nil, fmt.Errorf("allocate network slot: %w", err) } slot := network.NewSlot(slotIdx) @@ -327,7 +449,9 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox if err := network.CreateNetwork(slot); err != nil { source.Close() m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + rollbackCow() + m.loops.Release(baseImagePath) return nil, fmt.Errorf("create network: %w", err) } @@ -337,9 +461,11 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox uffdServer := uffd.NewServer(uffdSocketPath, source) if err := uffdServer.Start(ctx); err != nil { source.Close() - network.RemoveNetwork(slot) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + rollbackCow() + m.loops.Release(baseImagePath) return nil, fmt.Errorf("start uffd server: %w", err) } @@ -347,7 +473,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox vmCfg := vm.VMConfig{ SandboxID: sandboxID, KernelPath: m.cfg.KernelPath, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, VCPUs: int(header.Metadata.Size / (1024 * 1024)), // Will be overridden by snapshot. MemoryMB: int(header.Metadata.Size / (1024 * 1024)), NetworkNamespace: slot.NamespaceID, @@ -360,11 +486,13 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox snapPath := snapshot.SnapPath(snapDir, sandboxID) if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil { - uffdServer.Stop() + warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) source.Close() - network.RemoveNetwork(slot) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + rollbackCow() + m.loops.Release(baseImagePath) return nil, fmt.Errorf("restore VM from snapshot: %w", err) } @@ -374,12 +502,14 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox defer waitCancel() if err := client.WaitUntilReady(waitCtx); err != nil { - uffdServer.Stop() + warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) source.Close() - m.vm.Destroy(context.Background(), sandboxID) - network.RemoveNetwork(slot) + warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseImagePath) return nil, fmt.Errorf("wait for envd: %w", err) } @@ -394,25 +524,29 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox TimeoutSec: 0, SlotIndex: slotIdx, HostIP: slot.HostIP, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, }, slot: slot, client: client, uffdSocketPath: uffdSocketPath, + dmDevice: dmDev, + baseImagePath: baseImagePath, } m.mu.Lock() m.boxes[sandboxID] = sb m.mu.Unlock() - // Clean up the snapshot files now that the sandbox is running. - snapshot.Remove(snapDir, sandboxID) + // Clean up remaining snapshot files (snapfile, memfile, header, meta). + // The CoW file was already moved out. + warnErr("snapshot cleanup error", sandboxID, snapshot.Remove(snapDir, sandboxID)) slog.Info("sandbox resumed from snapshot", "id", sandboxID, "host_ip", slot.HostIP.String(), + "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, nil @@ -421,7 +555,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox // CreateSnapshot creates a reusable template from a sandbox. Works on both // running and paused sandboxes. If the sandbox is running, it is paused first. // The sandbox remains paused after this call (it can still be resumed). -// The template files are copied to ImagesDir/{name}/. +// +// The rootfs is flattened (base + CoW merged) into a new standalone rootfs.ext4 +// so the template has no dependency on the original base image. Memory state +// and VM snapshot files are copied as-is. func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (int64, error) { // If the sandbox is running, pause it first. if _, err := m.get(sandboxID); err == nil { @@ -435,29 +572,73 @@ func (m *Manager) CreateSnapshot(ctx context.Context, sandboxID, name string) (i return 0, fmt.Errorf("no snapshot found for sandbox %s", sandboxID) } - // Copy snapshot files to ImagesDir/{name}/ as a reusable template. + // Create template directory. if err := snapshot.EnsureDir(m.cfg.ImagesDir, name); err != nil { return 0, fmt.Errorf("create template dir: %w", err) } + // Copy VM snapshot and memory files. srcDir := snapshot.DirPath(m.cfg.SnapshotsDir, sandboxID) dstDir := snapshot.DirPath(m.cfg.ImagesDir, name) - for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName, snapshot.RootfsFileName} { + for _, fname := range []string{snapshot.SnapFileName, snapshot.MemDiffName, snapshot.MemHeaderName} { src := filepath.Join(srcDir, fname) dst := filepath.Join(dstDir, fname) - if err := filesystem.CloneRootfs(src, dst); err != nil { - snapshot.Remove(m.cfg.ImagesDir, name) + if err := copyFile(src, dst); err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) return 0, fmt.Errorf("copy %s: %w", fname, err) } } + // Flatten rootfs: temporarily set up dm device from base + CoW, dd to new image. + meta, err := snapshot.ReadMeta(m.cfg.SnapshotsDir, sandboxID) + if err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("read rootfs meta: %w", err) + } + + originLoop, err := m.loops.Acquire(meta.BaseTemplate) + if err != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("acquire loop device for flatten: %w", err) + } + + originSize, err := devicemapper.OriginSizeBytes(originLoop) + if err != nil { + m.loops.Release(meta.BaseTemplate) + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("get origin size: %w", err) + } + + // Temporarily restore the dm-snapshot to read the merged view. + cowPath := snapshot.CowPath(m.cfg.SnapshotsDir, sandboxID) + tmpDmName := "wrenn-flatten-" + sandboxID + tmpDev, err := devicemapper.RestoreSnapshot(tmpDmName, originLoop, cowPath, originSize) + if err != nil { + m.loops.Release(meta.BaseTemplate) + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("restore dm-snapshot for flatten: %w", err) + } + + // Flatten to new standalone rootfs. + flattenedPath := snapshot.RootfsPath(m.cfg.ImagesDir, name) + flattenErr := devicemapper.FlattenSnapshot(tmpDev.DevicePath, flattenedPath) + + // Always clean up the temporary dm device. + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(tmpDev)) + m.loops.Release(meta.BaseTemplate) + + if flattenErr != nil { + warnErr("template dir cleanup error", name, snapshot.Remove(m.cfg.ImagesDir, name)) + return 0, fmt.Errorf("flatten rootfs: %w", flattenErr) + } + sizeBytes, err := snapshot.DirSize(m.cfg.ImagesDir, name) if err != nil { slog.Warn("failed to calculate snapshot size", "error", err) } - slog.Info("snapshot created", + slog.Info("template snapshot created (rootfs flattened)", "sandbox", sandboxID, "name", name, "size_bytes", sizeBytes, @@ -472,7 +653,9 @@ func (m *Manager) DeleteSnapshot(name string) error { // createFromSnapshot creates a new sandbox by restoring from a snapshot template // in ImagesDir/{snapshotName}/. Uses UFFD for lazy memory loading. -func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, memoryMB, timeoutSec int) (*models.Sandbox, error) { +// The template's rootfs.ext4 is a flattened standalone image — we create a +// dm-snapshot on top of it just like a normal Create. +func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotName string, vcpus, _, timeoutSec int) (*models.Sandbox, error) { imagesDir := m.cfg.ImagesDir // Read the header. @@ -486,10 +669,8 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam return nil, fmt.Errorf("deserialize header: %w", err) } - // Snapshot determines memory size. VCPUs are also baked into the - // snapshot state — the caller should pass the correct value from - // the template DB record. - memoryMB = int(header.Metadata.Size / (1024 * 1024)) + // Snapshot determines memory size. + memoryMB := int(header.Metadata.Size / (1024 * 1024)) // Build diff file map. diffPaths := map[string]string{ @@ -501,19 +682,37 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam return nil, fmt.Errorf("create memory source: %w", err) } - // Clone snapshot rootfs. - snapshotRootfs := snapshot.RootfsPath(imagesDir, snapshotName) - rootfsPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s-%s.ext4", sandboxID, snapshotName)) - if err := filesystem.CloneRootfs(snapshotRootfs, rootfsPath); err != nil { + // Set up dm-snapshot on the template's flattened rootfs. + baseRootfs := snapshot.RootfsPath(imagesDir, snapshotName) + originLoop, err := m.loops.Acquire(baseRootfs) + if err != nil { source.Close() - return nil, fmt.Errorf("clone snapshot rootfs: %w", err) + return nil, fmt.Errorf("acquire loop device: %w", err) + } + + originSize, err := devicemapper.OriginSizeBytes(originLoop) + if err != nil { + source.Close() + m.loops.Release(baseRootfs) + return nil, fmt.Errorf("get origin size: %w", err) + } + + dmName := "wrenn-" + sandboxID + cowPath := filepath.Join(m.cfg.SandboxesDir, fmt.Sprintf("%s.cow", sandboxID)) + dmDev, err := devicemapper.CreateSnapshot(dmName, originLoop, cowPath, originSize) + if err != nil { + source.Close() + m.loops.Release(baseRootfs) + return nil, fmt.Errorf("create dm-snapshot: %w", err) } // Allocate network. slotIdx, err := m.slots.Allocate() if err != nil { source.Close() - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("allocate network slot: %w", err) } slot := network.NewSlot(slotIdx) @@ -521,7 +720,9 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam if err := network.CreateNetwork(slot); err != nil { source.Close() m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("create network: %w", err) } @@ -531,9 +732,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam uffdServer := uffd.NewServer(uffdSocketPath, source) if err := uffdServer.Start(ctx); err != nil { source.Close() - network.RemoveNetwork(slot) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("start uffd server: %w", err) } @@ -541,7 +744,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam vmCfg := vm.VMConfig{ SandboxID: sandboxID, KernelPath: m.cfg.KernelPath, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, VCPUs: vcpus, MemoryMB: memoryMB, NetworkNamespace: slot.NamespaceID, @@ -554,11 +757,13 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam snapPath := snapshot.SnapPath(imagesDir, snapshotName) if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, snapPath, uffdSocketPath); err != nil { - uffdServer.Stop() + warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) source.Close() - network.RemoveNetwork(slot) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("restore VM from snapshot: %w", err) } @@ -568,12 +773,14 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam defer waitCancel() if err := client.WaitUntilReady(waitCtx); err != nil { - uffdServer.Stop() + warnErr("uffd server stop error", sandboxID, uffdServer.Stop()) source.Close() - m.vm.Destroy(context.Background(), sandboxID) - network.RemoveNetwork(slot) + warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID)) + warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot)) m.slots.Release(slotIdx) - os.Remove(rootfsPath) + warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(dmDev)) + os.Remove(cowPath) + m.loops.Release(baseRootfs) return nil, fmt.Errorf("wait for envd: %w", err) } @@ -588,13 +795,15 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam TimeoutSec: timeoutSec, SlotIndex: slotIdx, HostIP: slot.HostIP, - RootfsPath: rootfsPath, + RootfsPath: dmDev.DevicePath, CreatedAt: now, LastActiveAt: now, }, slot: slot, client: client, uffdSocketPath: uffdSocketPath, + dmDevice: dmDev, + baseImagePath: baseRootfs, } m.mu.Lock() @@ -605,6 +814,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID, snapshotNam "id", sandboxID, "snapshot", snapshotName, "host_ip", slot.HostIP.String(), + "dm_device", dmDev.DevicePath, ) return &sb.Sandbox, nil @@ -735,7 +945,7 @@ func (m *Manager) reapExpired(ctx context.Context) { } } -// Shutdown destroys all sandboxes and stops the TTL reaper. +// Shutdown destroys all sandboxes, releases loop devices, and stops the TTL reaper. func (m *Manager) Shutdown(ctx context.Context) { close(m.stopCh) @@ -752,4 +962,35 @@ func (m *Manager) Shutdown(ctx context.Context) { slog.Warn("shutdown destroy failed", "id", sbID, "error", err) } } + + m.loops.ReleaseAll() +} + +// warnErr logs a warning if err is non-nil. Used for best-effort cleanup +// in error paths where the primary error has already been captured. +func warnErr(msg string, id string, err error) { + if err != nil { + slog.Warn(msg, "id", id, "error", err) + } +} + +// copyFile copies a regular file from src to dst using streaming I/O. +func copyFile(src, dst string) error { + sf, err := os.Open(src) + if err != nil { + return fmt.Errorf("open %s: %w", src, err) + } + defer sf.Close() + + df, err := os.Create(dst) + if err != nil { + return fmt.Errorf("create %s: %w", dst, err) + } + defer df.Close() + + if _, err := df.ReadFrom(sf); err != nil { + os.Remove(dst) + return fmt.Errorf("copy %s → %s: %w", src, dst, err) + } + return nil } diff --git a/internal/snapshot/local.go b/internal/snapshot/local.go index 0ed5dbd..7a7e750 100644 --- a/internal/snapshot/local.go +++ b/internal/snapshot/local.go @@ -1,10 +1,12 @@ package snapshot import ( + "encoding/json" "fmt" "io/fs" "os" "path/filepath" + "syscall" ) const ( @@ -12,6 +14,8 @@ const ( MemDiffName = "memfile" MemHeaderName = "memfile.header" RootfsFileName = "rootfs.ext4" + RootfsCowName = "rootfs.cow" + RootfsMetaName = "rootfs.meta" ) // DirPath returns the snapshot directory for a given name. @@ -39,15 +43,68 @@ func RootfsPath(baseDir, name string) string { return filepath.Join(DirPath(baseDir, name), RootfsFileName) } +// CowPath returns the path to the rootfs CoW diff file. +func CowPath(baseDir, name string) string { + return filepath.Join(DirPath(baseDir, name), RootfsCowName) +} + +// MetaPath returns the path to the rootfs metadata file. +func MetaPath(baseDir, name string) string { + return filepath.Join(DirPath(baseDir, name), RootfsMetaName) +} + +// RootfsMeta records which base template a CoW file was created against. +type RootfsMeta struct { + BaseTemplate string `json:"base_template"` +} + +// WriteMeta writes rootfs metadata to the snapshot directory. +func WriteMeta(baseDir, name string, meta *RootfsMeta) error { + data, err := json.Marshal(meta) + if err != nil { + return fmt.Errorf("marshal rootfs meta: %w", err) + } + if err := os.WriteFile(MetaPath(baseDir, name), data, 0644); err != nil { + return fmt.Errorf("write rootfs meta: %w", err) + } + return nil +} + +// ReadMeta reads rootfs metadata from the snapshot directory. +func ReadMeta(baseDir, name string) (*RootfsMeta, error) { + data, err := os.ReadFile(MetaPath(baseDir, name)) + if err != nil { + return nil, fmt.Errorf("read rootfs meta: %w", err) + } + var meta RootfsMeta + if err := json.Unmarshal(data, &meta); err != nil { + return nil, fmt.Errorf("unmarshal rootfs meta: %w", err) + } + return &meta, nil +} + // Exists reports whether a complete snapshot exists (all required files present). +// Supports both legacy (rootfs.ext4) and CoW-based (rootfs.cow + rootfs.meta) snapshots. func Exists(baseDir, name string) bool { dir := DirPath(baseDir, name) - for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName, RootfsFileName} { + + // Common files required by both formats. + for _, f := range []string{SnapFileName, MemDiffName, MemHeaderName} { if _, err := os.Stat(filepath.Join(dir, f)); err != nil { return false } } - return true + + // Accept either rootfs.ext4 (legacy) or rootfs.cow + rootfs.meta (dm-snapshot). + if _, err := os.Stat(filepath.Join(dir, RootfsFileName)); err == nil { + return true + } + if _, err := os.Stat(filepath.Join(dir, RootfsCowName)); err == nil { + if _, err := os.Stat(filepath.Join(dir, RootfsMetaName)); err == nil { + return true + } + } + return false } // IsTemplate reports whether a template image directory exists (has rootfs.ext4). @@ -61,6 +118,15 @@ func IsSnapshot(baseDir, name string) bool { return Exists(baseDir, name) } +// HasCow reports whether a snapshot uses CoW format (rootfs.cow + rootfs.meta) +// as opposed to legacy full rootfs (rootfs.ext4). +func HasCow(baseDir, name string) bool { + dir := DirPath(baseDir, name) + _, cowErr := os.Stat(filepath.Join(dir, RootfsCowName)) + _, metaErr := os.Stat(filepath.Join(dir, RootfsMetaName)) + return cowErr == nil && metaErr == nil +} + // EnsureDir creates the snapshot directory if it doesn't exist. func EnsureDir(baseDir, name string) error { dir := DirPath(baseDir, name) @@ -75,12 +141,14 @@ func Remove(baseDir, name string) error { return os.RemoveAll(DirPath(baseDir, name)) } -// DirSize returns the total byte size of all files in the snapshot directory. +// DirSize returns the actual disk usage of all files in the snapshot directory. +// Uses block-based accounting (stat.Blocks * 512) so sparse files report only +// the blocks that are actually allocated, not their apparent size. func DirSize(baseDir, name string) (int64, error) { var total int64 dir := DirPath(baseDir, name) - err := filepath.WalkDir(dir, func(_ string, d fs.DirEntry, err error) error { + err := filepath.WalkDir(dir, func(path string, d fs.DirEntry, err error) error { if err != nil { return err } @@ -91,7 +159,13 @@ func DirSize(baseDir, name string) (int64, error) { if err != nil { return err } - total += info.Size() + if sys, ok := info.Sys().(*syscall.Stat_t); ok { + // Blocks is in 512-byte units regardless of filesystem block size. + total += sys.Blocks * 512 + } else { + // Fallback to apparent size if syscall stat is unavailable. + total += info.Size() + } return nil }) if err != nil { diff --git a/internal/uffd/server.go b/internal/uffd/server.go index 7b2e68b..40d29c7 100644 --- a/internal/uffd/server.go +++ b/internal/uffd/server.go @@ -115,7 +115,7 @@ func (s *Server) Ready() <-chan struct{} { // Stop signals the UFFD poll loop to exit and waits for it to finish. func (s *Server) Stop() error { // Write a byte to the exit pipe to wake the poll loop. - s.exitW.Write([]byte{0}) + _, _ = s.exitW.Write([]byte{0}) <-s.doneCh return s.doneErr } diff --git a/internal/vm/config.go b/internal/vm/config.go index 1a40b21..35bc293 100644 --- a/internal/vm/config.go +++ b/internal/vm/config.go @@ -10,8 +10,8 @@ type VMConfig struct { // KernelPath is the path to the uncompressed Linux kernel (vmlinux). KernelPath string - // RootfsPath is the path to the ext4 rootfs image for this sandbox. - // This should be a per-sandbox copy (reflink clone of the base image). + // RootfsPath is the path to the rootfs block device for this sandbox. + // Typically a dm-snapshot device (e.g., /dev/mapper/wrenn-sb-a1b2c3d4). RootfsPath string // VCPUs is the number of virtual CPUs to allocate (default: 1). diff --git a/internal/vm/fc.go b/internal/vm/fc.go index 1acef28..fccde2f 100644 --- a/internal/vm/fc.go +++ b/internal/vm/fc.go @@ -131,15 +131,6 @@ func (c *fcClient) createSnapshot(ctx context.Context, snapPath, memPath string) }) } -// loadSnapshot loads a VM snapshot from a file-backed memory image. -func (c *fcClient) loadSnapshot(ctx context.Context, snapPath, memPath string) error { - return c.do(ctx, http.MethodPut, "/snapshot/load", map[string]any{ - "snapshot_path": snapPath, - "mem_file_path": memPath, - "resume_vm": false, - }) -} - // loadSnapshotWithUffd loads a VM snapshot using a UFFD socket for // lazy memory loading. Firecracker will connect to the socket and // send the uffd fd + memory region mappings. diff --git a/internal/vm/manager.go b/internal/vm/manager.go index a782b42..5e107ea 100644 --- a/internal/vm/manager.go +++ b/internal/vm/manager.go @@ -53,7 +53,7 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) { // Step 2: Wait for the API socket to appear. if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("wait for socket: %w", err) } @@ -61,13 +61,13 @@ func (m *Manager) Create(ctx context.Context, cfg VMConfig) (*VM, error) { client := newFCClient(cfg.SocketPath) if err := configureVM(ctx, client, &cfg); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("configure VM: %w", err) } // Step 4: Start the VM. if err := client.startVM(ctx); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("start VM: %w", err) } @@ -218,7 +218,7 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath // Step 2: Wait for the API socket. if err := waitForSocket(ctx, cfg.SocketPath, proc); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("wait for socket: %w", err) } @@ -228,13 +228,13 @@ func (m *Manager) CreateFromSnapshot(ctx context.Context, cfg VMConfig, snapPath // No boot resources are configured — the snapshot carries kernel, // drive, network, and machine config state. if err := client.loadSnapshotWithUffd(ctx, snapPath, uffdSocketPath); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("load snapshot: %w", err) } // Step 4: Resume the VM. if err := client.resumeVM(ctx); err != nil { - proc.stop() + _ = proc.stop() return nil, fmt.Errorf("resume VM: %w", err) }