Add sandbox snapshot and restore with UFFD lazy memory loading
Implement full snapshot lifecycle: pause (snapshot + free resources), resume (UFFD-based lazy restore), and named snapshot templates that can spawn new sandboxes from frozen VM state. Key changes: - Snapshot header system with generational diff mapping (inspired by e2b) - UFFD server for lazy page fault handling during snapshot restore - Stable rootfs symlink path (/tmp/fc-vm/) for snapshot compatibility - Templates DB table and CRUD API endpoints (POST/GET/DELETE /v1/snapshots) - CreateSnapshot/DeleteSnapshot RPCs in hostagent proto - Reconciler excludes paused sandboxes (expected absent from host agent) - Snapshot templates lock vcpus/memory to baked-in values - Proper cleanup of uffd sockets and pause snapshot files on destroy
This commit is contained in:
@ -44,7 +44,7 @@ type wsStopMsg struct {
|
||||
|
||||
// wsOutMsg is sent by the server for process events.
|
||||
type wsOutMsg struct {
|
||||
Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error"
|
||||
Type string `json:"type"` // "start", "stdout", "stderr", "exit", "error"
|
||||
PID uint32 `json:"pid,omitempty"` // only for "start"
|
||||
Data string `json:"data,omitempty"` // only for "stdout", "stderr", "error"
|
||||
ExitCode *int32 `json:"exit_code,omitempty"` // only for "exit"
|
||||
|
||||
@ -97,6 +97,17 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
ctx := r.Context()
|
||||
|
||||
// If the template is a snapshot, use its baked-in vcpus/memory
|
||||
// (they cannot be changed since the VM state is frozen).
|
||||
if tmpl, err := h.db.GetTemplate(ctx, req.Template); err == nil && tmpl.Type == "snapshot" {
|
||||
if tmpl.Vcpus.Valid {
|
||||
req.VCPUs = tmpl.Vcpus.Int32
|
||||
}
|
||||
if tmpl.MemoryMb.Valid {
|
||||
req.MemoryMB = tmpl.MemoryMb.Int32
|
||||
}
|
||||
}
|
||||
sandboxID := id.NewSandboxID()
|
||||
|
||||
// Insert pending record.
|
||||
@ -182,6 +193,8 @@ func (h *sandboxHandler) Get(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// Pause handles POST /v1/sandboxes/{id}/pause.
|
||||
// Pause = snapshot + destroy. The sandbox is frozen to disk and all running
|
||||
// resources are released. It can be resumed later.
|
||||
func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
|
||||
sandboxID := chi.URLParam(r, "id")
|
||||
ctx := r.Context()
|
||||
@ -216,6 +229,7 @@ func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
// Resume handles POST /v1/sandboxes/{id}/resume.
|
||||
// Resume restores a paused sandbox from snapshot using UFFD lazy memory loading.
|
||||
func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
|
||||
sandboxID := chi.URLParam(r, "id")
|
||||
ctx := r.Context()
|
||||
@ -230,16 +244,24 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
||||
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
||||
SandboxId: sandboxID,
|
||||
})); err != nil {
|
||||
}))
|
||||
if err != nil {
|
||||
status, code, msg := agentErrToHTTP(err)
|
||||
writeError(w, status, code, msg)
|
||||
return
|
||||
}
|
||||
|
||||
sb, err = h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: sandboxID, Status: "running",
|
||||
now := time.Now()
|
||||
sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
|
||||
ID: sandboxID,
|
||||
HostIp: resp.Msg.HostIp,
|
||||
GuestIp: "",
|
||||
StartedAt: pgtype.Timestamptz{
|
||||
Time: now,
|
||||
Valid: true,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "db_error", "failed to update status")
|
||||
|
||||
187
internal/api/handlers_snapshots.go
Normal file
187
internal/api/handlers_snapshots.go
Normal file
@ -0,0 +1,187 @@
|
||||
package api
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"connectrpc.com/connect"
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/jackc/pgx/v5/pgtype"
|
||||
|
||||
"git.omukk.dev/wrenn/sandbox/internal/db"
|
||||
"git.omukk.dev/wrenn/sandbox/internal/id"
|
||||
pb "git.omukk.dev/wrenn/sandbox/proto/hostagent/gen"
|
||||
"git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect"
|
||||
)
|
||||
|
||||
type snapshotHandler struct {
|
||||
db *db.Queries
|
||||
agent hostagentv1connect.HostAgentServiceClient
|
||||
}
|
||||
|
||||
func newSnapshotHandler(db *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *snapshotHandler {
|
||||
return &snapshotHandler{db: db, agent: agent}
|
||||
}
|
||||
|
||||
type createSnapshotRequest struct {
|
||||
SandboxID string `json:"sandbox_id"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
type snapshotResponse struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
VCPUs *int32 `json:"vcpus,omitempty"`
|
||||
MemoryMB *int32 `json:"memory_mb,omitempty"`
|
||||
SizeBytes int64 `json:"size_bytes"`
|
||||
CreatedAt string `json:"created_at"`
|
||||
}
|
||||
|
||||
func templateToResponse(t db.Template) snapshotResponse {
|
||||
resp := snapshotResponse{
|
||||
Name: t.Name,
|
||||
Type: t.Type,
|
||||
SizeBytes: t.SizeBytes,
|
||||
}
|
||||
if t.Vcpus.Valid {
|
||||
resp.VCPUs = &t.Vcpus.Int32
|
||||
}
|
||||
if t.MemoryMb.Valid {
|
||||
resp.MemoryMB = &t.MemoryMb.Int32
|
||||
}
|
||||
if t.CreatedAt.Valid {
|
||||
resp.CreatedAt = t.CreatedAt.Time.Format(time.RFC3339)
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// Create handles POST /v1/snapshots.
|
||||
func (h *snapshotHandler) Create(w http.ResponseWriter, r *http.Request) {
|
||||
var req createSnapshotRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "invalid JSON body")
|
||||
return
|
||||
}
|
||||
|
||||
if req.SandboxID == "" {
|
||||
writeError(w, http.StatusBadRequest, "invalid_request", "sandbox_id is required")
|
||||
return
|
||||
}
|
||||
|
||||
if req.Name == "" {
|
||||
req.Name = id.NewSnapshotName()
|
||||
}
|
||||
|
||||
ctx := r.Context()
|
||||
overwrite := r.URL.Query().Get("overwrite") == "true"
|
||||
|
||||
// Check if name already exists.
|
||||
if _, err := h.db.GetTemplate(ctx, req.Name); err == nil {
|
||||
if !overwrite {
|
||||
writeError(w, http.StatusConflict, "already_exists", "snapshot name already exists; use ?overwrite=true to replace")
|
||||
return
|
||||
}
|
||||
// Delete existing template record and files.
|
||||
h.db.DeleteTemplate(ctx, req.Name)
|
||||
}
|
||||
|
||||
// Verify sandbox exists and is running or paused.
|
||||
sb, err := h.db.GetSandbox(ctx, req.SandboxID)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
|
||||
return
|
||||
}
|
||||
if sb.Status != "running" && sb.Status != "paused" {
|
||||
writeError(w, http.StatusConflict, "invalid_state", "sandbox must be running or paused")
|
||||
return
|
||||
}
|
||||
|
||||
// Call host agent to create snapshot. If running, the agent pauses it first.
|
||||
// The sandbox remains paused after this call.
|
||||
resp, err := h.agent.CreateSnapshot(ctx, connect.NewRequest(&pb.CreateSnapshotRequest{
|
||||
SandboxId: req.SandboxID,
|
||||
Name: req.Name,
|
||||
}))
|
||||
if err != nil {
|
||||
status, code, msg := agentErrToHTTP(err)
|
||||
writeError(w, status, code, msg)
|
||||
return
|
||||
}
|
||||
|
||||
// Mark sandbox as paused (if it was running, it got paused by the snapshot).
|
||||
if sb.Status != "paused" {
|
||||
if _, err := h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
|
||||
ID: req.SandboxID, Status: "paused",
|
||||
}); err != nil {
|
||||
slog.Error("failed to update sandbox status after snapshot", "sandbox_id", req.SandboxID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Insert template record.
|
||||
tmpl, err := h.db.InsertTemplate(ctx, db.InsertTemplateParams{
|
||||
Name: req.Name,
|
||||
Type: "snapshot",
|
||||
Vcpus: pgtype.Int4{Int32: sb.Vcpus, Valid: true},
|
||||
MemoryMb: pgtype.Int4{Int32: sb.MemoryMb, Valid: true},
|
||||
SizeBytes: resp.Msg.SizeBytes,
|
||||
})
|
||||
if err != nil {
|
||||
slog.Error("failed to insert template record", "name", req.Name, "error", err)
|
||||
writeError(w, http.StatusInternalServerError, "db_error", "snapshot created but failed to record in database")
|
||||
return
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusCreated, templateToResponse(tmpl))
|
||||
}
|
||||
|
||||
// List handles GET /v1/snapshots.
|
||||
func (h *snapshotHandler) List(w http.ResponseWriter, r *http.Request) {
|
||||
ctx := r.Context()
|
||||
typeFilter := r.URL.Query().Get("type")
|
||||
|
||||
var templates []db.Template
|
||||
var err error
|
||||
if typeFilter != "" {
|
||||
templates, err = h.db.ListTemplatesByType(ctx, typeFilter)
|
||||
} else {
|
||||
templates, err = h.db.ListTemplates(ctx)
|
||||
}
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "db_error", "failed to list templates")
|
||||
return
|
||||
}
|
||||
|
||||
resp := make([]snapshotResponse, len(templates))
|
||||
for i, t := range templates {
|
||||
resp[i] = templateToResponse(t)
|
||||
}
|
||||
|
||||
writeJSON(w, http.StatusOK, resp)
|
||||
}
|
||||
|
||||
// Delete handles DELETE /v1/snapshots/{name}.
|
||||
func (h *snapshotHandler) Delete(w http.ResponseWriter, r *http.Request) {
|
||||
name := chi.URLParam(r, "name")
|
||||
ctx := r.Context()
|
||||
|
||||
if _, err := h.db.GetTemplate(ctx, name); err != nil {
|
||||
writeError(w, http.StatusNotFound, "not_found", "template not found")
|
||||
return
|
||||
}
|
||||
|
||||
// Delete files on host agent.
|
||||
if _, err := h.agent.DeleteSnapshot(ctx, connect.NewRequest(&pb.DeleteSnapshotRequest{
|
||||
Name: name,
|
||||
})); err != nil {
|
||||
slog.Warn("delete snapshot: agent RPC failed", "name", name, "error", err)
|
||||
}
|
||||
|
||||
if err := h.db.DeleteTemplate(ctx, name); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "db_error", "failed to delete template record")
|
||||
return
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
@ -126,9 +126,13 @@ paths:
|
||||
post:
|
||||
summary: Pause a running sandbox
|
||||
operationId: pauseSandbox
|
||||
description: |
|
||||
Takes a snapshot of the sandbox (VM state + memory + rootfs), then
|
||||
destroys all running resources. The sandbox exists only as files on
|
||||
disk and can be resumed later.
|
||||
responses:
|
||||
"200":
|
||||
description: Sandbox paused
|
||||
description: Sandbox paused (snapshot taken, resources released)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@ -151,9 +155,13 @@ paths:
|
||||
post:
|
||||
summary: Resume a paused sandbox
|
||||
operationId: resumeSandbox
|
||||
description: |
|
||||
Restores a paused sandbox from its snapshot using UFFD for lazy
|
||||
memory loading. Boots a fresh Firecracker process, sets up a new
|
||||
network slot, and waits for envd to become ready.
|
||||
responses:
|
||||
"200":
|
||||
description: Sandbox resumed
|
||||
description: Sandbox resumed (new VM booted from snapshot)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@ -165,6 +173,85 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/snapshots:
|
||||
post:
|
||||
summary: Create a snapshot template
|
||||
operationId: createSnapshot
|
||||
description: |
|
||||
Pauses a running sandbox, takes a full snapshot, copies the snapshot
|
||||
files to the images directory as a reusable template, then destroys
|
||||
the sandbox. The template can be used to create new sandboxes.
|
||||
parameters:
|
||||
- name: overwrite
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
enum: ["true"]
|
||||
description: Set to "true" to overwrite an existing snapshot with the same name.
|
||||
requestBody:
|
||||
required: true
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/CreateSnapshotRequest"
|
||||
responses:
|
||||
"201":
|
||||
description: Snapshot created
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Template"
|
||||
"409":
|
||||
description: Name already exists or sandbox not running
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
get:
|
||||
summary: List templates
|
||||
operationId: listSnapshots
|
||||
parameters:
|
||||
- name: type
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: string
|
||||
enum: [base, snapshot]
|
||||
description: Filter by template type.
|
||||
responses:
|
||||
"200":
|
||||
description: List of templates
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/Template"
|
||||
|
||||
/v1/snapshots/{name}:
|
||||
parameters:
|
||||
- name: name
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
delete:
|
||||
summary: Delete a snapshot template
|
||||
operationId: deleteSnapshot
|
||||
description: Removes the snapshot files from disk and deletes the database record.
|
||||
responses:
|
||||
"204":
|
||||
description: Snapshot deleted
|
||||
"404":
|
||||
description: Template not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/sandboxes/{id}/files/write:
|
||||
parameters:
|
||||
- name: id
|
||||
@ -429,6 +516,38 @@ components:
|
||||
type: string
|
||||
format: date-time
|
||||
|
||||
CreateSnapshotRequest:
|
||||
type: object
|
||||
required: [sandbox_id]
|
||||
properties:
|
||||
sandbox_id:
|
||||
type: string
|
||||
description: ID of the running sandbox to snapshot.
|
||||
name:
|
||||
type: string
|
||||
description: Name for the snapshot template. Auto-generated if omitted.
|
||||
|
||||
Template:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
type:
|
||||
type: string
|
||||
enum: [base, snapshot]
|
||||
vcpus:
|
||||
type: integer
|
||||
nullable: true
|
||||
memory_mb:
|
||||
type: integer
|
||||
nullable: true
|
||||
size_bytes:
|
||||
type: integer
|
||||
format: int64
|
||||
created_at:
|
||||
type: string
|
||||
format: date-time
|
||||
|
||||
ExecRequest:
|
||||
type: object
|
||||
required: [cmd]
|
||||
|
||||
@ -62,10 +62,12 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
|
||||
alive[sb.SandboxId] = struct{}{}
|
||||
}
|
||||
|
||||
// Get all DB sandboxes for this host that are running or paused.
|
||||
// Get all DB sandboxes for this host that are running.
|
||||
// Paused sandboxes are excluded: they are expected to not exist on the
|
||||
// host agent because pause = snapshot + destroy resources.
|
||||
dbSandboxes, err := rc.db.ListSandboxesByHostAndStatus(ctx, db.ListSandboxesByHostAndStatusParams{
|
||||
HostID: rc.hostID,
|
||||
Column2: []string{"running", "paused"},
|
||||
Column2: []string{"running"},
|
||||
})
|
||||
if err != nil {
|
||||
slog.Warn("reconciler: failed to list DB sandboxes", "error", err)
|
||||
|
||||
@ -29,6 +29,7 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
|
||||
execStream := newExecStreamHandler(queries, agent)
|
||||
files := newFilesHandler(queries, agent)
|
||||
filesStream := newFilesStreamHandler(queries, agent)
|
||||
snapshots := newSnapshotHandler(queries, agent)
|
||||
|
||||
// OpenAPI spec and docs.
|
||||
r.Get("/openapi.yaml", serveOpenAPI)
|
||||
@ -53,6 +54,13 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient) *
|
||||
})
|
||||
})
|
||||
|
||||
// Snapshot / template management.
|
||||
r.Route("/v1/snapshots", func(r chi.Router) {
|
||||
r.Post("/", snapshots.Create)
|
||||
r.Get("/", snapshots.List)
|
||||
r.Delete("/{name}", snapshots.Delete)
|
||||
})
|
||||
|
||||
return &Server{router: r}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user