Add sandbox snapshot and restore with UFFD lazy memory loading

Implement full snapshot lifecycle: pause (snapshot + free resources),
resume (UFFD-based lazy restore), and named snapshot templates that
can spawn new sandboxes from frozen VM state.

Key changes:
- Snapshot header system with generational diff mapping (inspired by e2b)
- UFFD server for lazy page fault handling during snapshot restore
- Stable rootfs symlink path (/tmp/fc-vm/) for snapshot compatibility
- Templates DB table and CRUD API endpoints (POST/GET/DELETE /v1/snapshots)
- CreateSnapshot/DeleteSnapshot RPCs in hostagent proto
- Reconciler excludes paused sandboxes (expected absent from host agent)
- Snapshot templates lock vcpus/memory to baked-in values
- Proper cleanup of uffd sockets and pause snapshot files on destroy
This commit is contained in:
2026-03-12 09:19:37 +06:00
parent 9b94df7f56
commit a1bd439c75
33 changed files with 2714 additions and 166 deletions

View File

@ -97,6 +97,17 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
}
ctx := r.Context()
// If the template is a snapshot, use its baked-in vcpus/memory
// (they cannot be changed since the VM state is frozen).
if tmpl, err := h.db.GetTemplate(ctx, req.Template); err == nil && tmpl.Type == "snapshot" {
if tmpl.Vcpus.Valid {
req.VCPUs = tmpl.Vcpus.Int32
}
if tmpl.MemoryMb.Valid {
req.MemoryMB = tmpl.MemoryMb.Int32
}
}
sandboxID := id.NewSandboxID()
// Insert pending record.
@ -182,6 +193,8 @@ func (h *sandboxHandler) Get(w http.ResponseWriter, r *http.Request) {
}
// Pause handles POST /v1/sandboxes/{id}/pause.
// Pause = snapshot + destroy. The sandbox is frozen to disk and all running
// resources are released. It can be resumed later.
func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
@ -216,6 +229,7 @@ func (h *sandboxHandler) Pause(w http.ResponseWriter, r *http.Request) {
}
// Resume handles POST /v1/sandboxes/{id}/resume.
// Resume restores a paused sandbox from snapshot using UFFD lazy memory loading.
func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
@ -230,16 +244,24 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
return
}
if _, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxID,
})); err != nil {
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
}
sb, err = h.db.UpdateSandboxStatus(ctx, db.UpdateSandboxStatusParams{
ID: sandboxID, Status: "running",
now := time.Now()
sb, err = h.db.UpdateSandboxRunning(ctx, db.UpdateSandboxRunningParams{
ID: sandboxID,
HostIp: resp.Msg.HostIp,
GuestIp: "",
StartedAt: pgtype.Timestamptz{
Time: now,
Valid: true,
},
})
if err != nil {
writeError(w, http.StatusInternalServerError, "db_error", "failed to update status")