forked from wrenn/wrenn

fix: prevent Go runtime memory corruption and sandbox halt after snapshot restore

Three root causes addressed:

1. Go page allocator corruption: allocations between the pre-snapshot GC
   and the VM freeze can leave the summary tree inconsistent if the freeze
   lands mid-update. After restore, GC reads corrupted metadata — either
   panicking (killing PID 1 → kernel panic) or silently failing to collect,
   causing unbounded heap growth until OOM. Fix: move GC to after all HTTP
   allocations in PostSnapshotPrepare, then set GOMAXPROCS(1) so any
   remaining allocations run sequentially with no concurrent page allocator
   access. GOMAXPROCS is restored on the first health check after restore;
   a distilled sketch of this pattern follows the list.

2. PostInit timeout starvation: WaitUntilReady and PostInit shared a
   single 30s context. If WaitUntilReady consumed most of it, PostInit
   failed — RestoreAfterSnapshot never ran, leaving envd with keep-alives
   disabled and zombie connections. Fix: separate timeout contexts.

3. CP HTTP server missing timeouts: without ReadHeaderTimeout or
   IdleTimeout, hung proxy connections leaked goroutines. Fix: add both,
   matching the host agent's values.
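
A distilled sketch of the prepare/recover pattern from fix 1, condensed from
the envd changes in this diff (names here are illustrative, not the actual
handler code):

```go
package snapshotprep

import (
	"runtime"
	"runtime/debug"
	"sync/atomic"
)

var (
	needsRestore   atomic.Bool
	prevGOMAXPROCS int
)

// prepareForFreeze runs after the HTTP response has been flushed, so the
// final GC sees every allocation made while serving the prepare request.
func prepareForFreeze() {
	runtime.GC()
	runtime.GC()
	debug.FreeOSMemory()
	// Single P: anything that still allocates before the VM freeze runs
	// sequentially, so the page allocator summary tree is never being
	// updated by two goroutines when the freeze lands.
	prevGOMAXPROCS = runtime.GOMAXPROCS(1)
	needsRestore.Store(true)
}

// recoverAfterRestore runs on the first request served after restore; any
// goroutine that was mid-allocation at freeze time has finished by then.
func recoverAfterRestore() {
	if !needsRestore.CompareAndSwap(true, false) {
		return
	}
	if prevGOMAXPROCS > 0 {
		runtime.GOMAXPROCS(prevGOMAXPROCS)
	}
	runtime.GC()
	runtime.GC()
	debug.FreeOSMemory()
}
```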

Also adds UFFD prefetch to proactively load all guest pages after restore,
eliminating on-demand page fault latency for subsequent RPC calls.
2026-05-02 17:22:51 +06:00
parent bb582deefa
commit 3deecbff89
13 changed files with 245 additions and 28 deletions

.gitignore vendored
View File

@@ -36,6 +36,7 @@ go.work.sum
e2b/
.impeccable.md
.gstack
.mcp.json
## Builds
builds/
@@ -49,3 +50,5 @@ frontend/build/
internal/dashboard/static/*
!internal/dashboard/static/.gitkeep
.dual-graph/
# Added by code-review-graph
.code-review-graph/

View File

@@ -372,3 +372,42 @@ All values are CSS custom properties in `frontend/src/app.css`.
4. **Legible at speed.** Users scan dashboards in seconds. Strong typographic contrast (serif h1, mono IDs, sans body), consistent patterns, and predictable placement let users orientate instantly without reading everything.
5. **Craft signals trust.** For infrastructure that runs production code, the quality of the UI is a proxy for the quality of the product. Pixel-level decisions matter. Polish is not decoration — it's a trust signal.
<!-- code-review-graph MCP tools -->
## MCP Tools: code-review-graph
**IMPORTANT: This project has a knowledge graph. ALWAYS use the
code-review-graph MCP tools BEFORE using Grep/Glob/Read to explore
the codebase.** The graph is faster, cheaper (fewer tokens), and gives
you structural context (callers, dependents, test coverage) that file
scanning cannot.
### When to use graph tools FIRST
- **Exploring code**: `semantic_search_nodes` or `query_graph` instead of Grep
- **Understanding impact**: `get_impact_radius` instead of manually tracing imports
- **Code review**: `detect_changes` + `get_review_context` instead of reading entire files
- **Finding relationships**: `query_graph` with callers_of/callees_of/imports_of/tests_for
- **Architecture questions**: `get_architecture_overview` + `list_communities`
Fall back to Grep/Glob/Read **only** when the graph doesn't cover what you need.
### Key Tools
| Tool | Use when |
|------|----------|
| `detect_changes` | Reviewing code changes — gives risk-scored analysis |
| `get_review_context` | Need source snippets for review — token-efficient |
| `get_impact_radius` | Understanding blast radius of a change |
| `get_affected_flows` | Finding which execution paths are impacted |
| `query_graph` | Tracing callers, callees, imports, tests, dependencies |
| `semantic_search_nodes` | Finding functions/classes by name or keyword |
| `get_architecture_overview` | Understanding high-level codebase structure |
| `refactor_tool` | Planning renames, finding dead code |
### Workflow
1. The graph auto-updates on file changes (via hooks).
2. Use `detect_changes` for code review.
3. Use `get_affected_flows` to understand impact.
4. Use `query_graph` pattern="tests_for" to check coverage.

View File

@@ -1 +1 @@
0.1.1
0.1.2

View File

@@ -1 +1 @@
0.1.4
0.1.5

View File

@@ -1 +1 @@
0.1.1
0.1.2

View File

@@ -51,7 +51,7 @@ func (t *ServerConnTracker) Track(conn net.Conn, state http.ConnState) {
// (with keep-alives disabled, the connection closes), RestoreAfterSnapshot
// will close any that survived into the snapshot as zombie TCP sockets.
//
// GC cycles are handled by PortSubsystem.Stop() which runs before this.
// GC is handled by PostSnapshotPrepare after this returns.
func (t *ServerConnTracker) PrepareForSnapshot() {
t.mu.Lock()
defer t.mu.Unlock()
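
RestoreAfterSnapshot itself is outside this hunk. Based on the comments here
and in PostInit ("clears preSnapshot set", re-enables keep-alives), its shape
is roughly the sketch below; the struct layout and field types are assumptions:

```go
package conntrack

import (
	"net"
	"net/http"
	"sync"
)

// Hedged sketch (not the code in this diff). Field names follow the comments
// in this PR: preSnapshot holds connections captured before the snapshot,
// srv is the *http.Server whose keep-alives PrepareForSnapshot disabled.
type connTrackerSketch struct {
	mu          sync.Mutex
	preSnapshot map[net.Conn]struct{}
	srv         *http.Server
}

func (t *connTrackerSketch) RestoreAfterSnapshot() {
	t.mu.Lock()
	defer t.mu.Unlock()
	for conn := range t.preSnapshot {
		conn.Close() // zombie TCP socket that survived into the snapshot
	}
	t.preSnapshot = make(map[net.Conn]struct{})
	t.srv.SetKeepAlivesEnabled(true) // safe to keep connections alive again
}
```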

View File

@@ -150,15 +150,17 @@ func (a *API) PostInit(w http.ResponseWriter, r *http.Request) {
host.PollForMMDSOpts(ctx, a.mmdsChan, a.defaults.EnvVars)
}()
// Close zombie connections from before the snapshot and re-enable
// keep-alives. On first boot this is a no-op (no zombie connections).
// Safety net: if the health check's postRestoreRecovery didn't run yet
// (e.g. PostInit arrived before the first health check), run the recovery
// (restore GOMAXPROCS, GC, cleanup) here. On first boot needsRestore is
// false, so the CAS is a no-op.
if a.needsRestore.CompareAndSwap(true, false) {
a.postRestoreRecovery()
}
// RestoreAfterSnapshot is idempotent (clears preSnapshot set), and
// Start is a no-op if already running.
if a.connTracker != nil {
a.connTracker.RestoreAfterSnapshot()
}
// Start the port scanner and forwarder if they were stopped by a
// pre-snapshot prepare call. Start is a no-op if already running,
// so this is safe on first boot and only takes effect after restore.
if a.portSubsystem != nil {
a.portSubsystem.Start(a.rootCtx)
}

View File

@@ -5,6 +5,8 @@ package api
import (
"net/http"
"runtime"
"runtime/debug"
)
// PostSnapshotPrepare quiesces continuous goroutines (port scanner, forwarder),
@@ -13,6 +15,14 @@ import (
// TCP state after snapshot restore. Keep-alives are disabled so the current
// request's connection also closes after the response.
//
// To prevent Go page allocator corruption, GOMAXPROCS is set to 1 after the
// final GC. With a single P, all goroutines (including any that allocate
// between now and the VM freeze) run sequentially. This eliminates concurrent
// page allocator access, so even if the freeze lands mid-allocation, the
// in-flight operation completes atomically on restore before any GC reads
// the summary tree. GOMAXPROCS is restored on the first health check after
// restore (see postRestoreRecovery).
//
// Called by the host agent as a best-effort signal before vm.Pause().
func (a *API) PostSnapshotPrepare(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
@@ -27,6 +37,26 @@ func (a *API) PostSnapshotPrepare(w http.ResponseWriter, r *http.Request) {
a.logger.Info().Msg("snapshot/prepare: idle connections closed, keep-alives disabled")
}
// Send the response before the GC so HTTP buffer allocations happen
// while GOMAXPROCS is still at its normal value.
w.Header().Set("Cache-Control", "no-store")
w.WriteHeader(http.StatusNoContent)
if f, ok := w.(http.Flusher); ok {
f.Flush()
}
// Final GC pass after all major allocations (connection cleanup,
// response write) are complete.
runtime.GC()
runtime.GC()
debug.FreeOSMemory()
// Reduce to a single P so any post-GC allocations (HTTP server
// connection teardown) run sequentially — no concurrent page allocator
// access that could leave the summary tree inconsistent if the VM
// freezes mid-update.
a.prevGOMAXPROCS = runtime.GOMAXPROCS(1)
a.needsRestore.Store(true)
a.logger.Info().Msg("snapshot/prepare: GOMAXPROCS=1, ready for freeze")
}
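
For context, the host-agent side of this contract is not part of this diff;
the expected sequence is roughly the sketch below. The endpoint path, the
VMController interface, and the function name are assumptions, not the host
agent's actual code:

```go
package hostagent

import (
	"context"
	"log/slog"
	"net/http"
)

// VMController is an assumed interface over the VMM: pause the vCPUs, then
// write the snapshot.
type VMController interface {
	Pause(ctx context.Context, sandboxID string) error
	Snapshot(ctx context.Context, sandboxID string) error
}

func prepareAndFreeze(ctx context.Context, envdBase, sandboxID string, vm VMController) error {
	// Best-effort signal: envd quiesces its goroutines, runs the final GC,
	// and drops to GOMAXPROCS(1). A failure here must not block the snapshot.
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, envdBase+"/snapshot/prepare", nil)
	if err == nil {
		if resp, derr := http.DefaultClient.Do(req); derr != nil {
			slog.Warn("snapshot prepare failed", "sandbox", sandboxID, "error", derr)
		} else {
			resp.Body.Close()
		}
	}
	if err := vm.Pause(ctx, sandboxID); err != nil {
		return err
	}
	return vm.Snapshot(ctx, sandboxID)
}
```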

View File

@@ -7,7 +7,10 @@ import (
"context"
"encoding/json"
"net/http"
"runtime"
"runtime/debug"
"sync"
"sync/atomic"
"github.com/rs/zerolog"
@@ -48,6 +51,12 @@ type API struct {
rootCtx context.Context
portSubsystem *publicport.PortSubsystem
connTracker *ServerConnTracker
// needsRestore is set by PostSnapshotPrepare and cleared on the first
// health check or PostInit after restore. While set, GOMAXPROCS is 1
// to prevent concurrent page allocator access during the freeze window.
needsRestore atomic.Bool
prevGOMAXPROCS int // GOMAXPROCS value before PostSnapshotPrepare reduced it to 1
}
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool, rootCtx context.Context, portSubsystem *publicport.PortSubsystem, connTracker *ServerConnTracker, version string) *API {
@@ -69,6 +78,14 @@ func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.
func (a *API) GetHealth(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
// On the first health check after snapshot restore, restore GOMAXPROCS,
// run a full GC, and clean up stale state. By this point, any goroutine
// that was mid-allocation when the VM was frozen has completed, so the
// page allocator summary tree is consistent and safe for GC to read.
if a.needsRestore.CompareAndSwap(true, false) {
a.postRestoreRecovery()
}
a.logger.Trace().Msg("Health check")
w.Header().Set("Cache-Control", "no-store")
@@ -79,6 +96,35 @@
})
}
// postRestoreRecovery restores GOMAXPROCS, runs a clean GC cycle, closes
// zombie TCP connections from before the snapshot, re-enables HTTP keep-alives,
// and restarts the port subsystem. Called exactly once per restore cycle,
// guarded by a CAS on needsRestore in both GetHealth and PostInit.
func (a *API) postRestoreRecovery() {
// Restore parallelism first — any goroutine that was mid-allocation
// when the VM froze has already completed by the time a health check
// or PostInit request is being served, so the page allocator summary
// tree is consistent and safe for a full GC.
prev := a.prevGOMAXPROCS
if prev > 0 {
runtime.GOMAXPROCS(prev)
}
runtime.GC()
runtime.GC()
debug.FreeOSMemory()
a.logger.Info().Msg("restore: GOMAXPROCS restored, GC complete")
if a.connTracker != nil {
a.connTracker.RestoreAfterSnapshot()
a.logger.Info().Msg("restore: zombie connections closed, keep-alives re-enabled")
}
if a.portSubsystem != nil {
a.portSubsystem.Start(a.rootCtx)
a.logger.Info().Msg("restore: port subsystem restarted")
}
}
func (a *API) GetMetrics(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()

View File

@@ -5,8 +5,6 @@ package port
import (
"context"
"runtime"
"runtime/debug"
"sync"
"time"
@@ -72,9 +70,12 @@ func (p *PortSubsystem) Start(parentCtx context.Context) {
}()
}
// Stop quiesces the scanner and forwarder goroutines and forces a GC cycle
// to put the Go runtime's page allocator in a consistent state before snapshot.
// Stop quiesces the scanner and forwarder goroutines.
// Blocks until both goroutines have exited. Safe to call if already stopped.
//
// GC is NOT run here — it is deferred to PostSnapshotPrepare so that the
// GC happens after all allocations (connection cleanup, HTTP response) are
// complete, minimizing the window where page allocator corruption can occur.
func (p *PortSubsystem) Stop() {
p.mu.Lock()
if !p.running {
@@ -90,12 +91,6 @@ func (p *PortSubsystem) Stop() {
cancelFn()
wg.Wait()
// Force two GC cycles to ensure all spans are swept and the page
// allocator summary tree is fully consistent before the VM is frozen.
runtime.GC()
runtime.GC()
debug.FreeOSMemory()
}
// Restart stops the subsystem (if running) and starts it again with a fresh

View File

@@ -768,12 +768,17 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
// Start prefetching all guest memory pages in the background.
// This runs concurrently with envd startup and eliminates on-demand
// page fault latency for subsequent RPC calls.
uffdServer.Prefetch()
// Wait for envd to be ready.
client := envdclient.New(slot.HostIP.String())
waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
waitCancel()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
@@ -784,9 +789,14 @@
m.loops.Release(baseImagePath)
return nil, fmt.Errorf("wait for envd: %w", err)
}
waitCancel()
// Trigger envd to re-read MMDS and apply template defaults in a single call.
if err := client.PostInitWithDefaults(waitCtx, defaultUser, defaultEnv); err != nil {
// PostInit gets its own timeout — WaitUntilReady may have consumed most
// of EnvdTimeout, starving PostInit of time for RestoreAfterSnapshot.
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
defer initCancel()
if err := client.PostInitWithDefaults(initCtx, defaultUser, defaultEnv); err != nil {
slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
}
@@ -1200,12 +1210,15 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
// Start prefetching all guest memory pages in the background.
uffdServer.Prefetch()
// Wait for envd.
client := envdclient.New(slot.HostIP.String())
waitCtx, waitCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
waitCancel()
warnErr("uffd server stop error", sandboxID, uffdServer.Stop())
source.Close()
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
@@ -1216,9 +1229,14 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
m.loops.Release(baseRootfs)
return nil, fmt.Errorf("wait for envd: %w", err)
}
waitCancel()
// Trigger envd to re-read MMDS so it picks up the new sandbox/template IDs.
if err := client.PostInit(waitCtx); err != nil {
// PostInit gets its own timeout — WaitUntilReady may have consumed most
// of EnvdTimeout, starving PostInit of time for RestoreAfterSnapshot.
initCtx, initCancel := context.WithTimeout(ctx, m.cfg.EnvdTimeout)
defer initCancel()
if err := client.PostInit(initCtx); err != nil {
slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err)
}

View File

@@ -57,6 +57,17 @@ type Server struct {
// exitPipe signals the poll loop to stop.
exitR *os.File
exitW *os.File
// Set by handle() after Firecracker connects; read by Prefetch()
// after waiting on readyCh (which establishes happens-before).
uffdFd fd
mapping *Mapping
// Prefetch lifecycle: cancel stops the goroutine, prefetchDone is
// closed when it exits. Stop() drains prefetchDone before returning
// so the caller can safely close diff file handles.
prefetchCancel context.CancelFunc
prefetchDone chan struct{}
}
// NewServer creates a UFFD server that will listen on the given socket path
@@ -113,10 +124,17 @@ func (s *Server) Ready() <-chan struct{} {
}
// Stop signals the UFFD poll loop to exit and waits for it to finish.
// Also cancels and waits for any running prefetch goroutine.
func (s *Server) Stop() error {
if s.prefetchCancel != nil {
s.prefetchCancel()
}
// Write a byte to the exit pipe to wake the poll loop.
_, _ = s.exitW.Write([]byte{0})
<-s.doneCh
if s.prefetchDone != nil {
<-s.prefetchDone
}
return s.doneErr
}
@@ -172,6 +190,10 @@ func (s *Server) handle(ctx context.Context) error {
mapping := NewMapping(regions)
// Store for use by Prefetch().
s.uffdFd = uffdFd
s.mapping = mapping
slog.Info("uffd handler connected",
"regions", len(regions),
"fd", int(uffdFd),
@@ -294,6 +316,66 @@ func (s *Server) faultPage(ctx context.Context, uffdFd fd, addr uintptr, offset
return nil
}
// Prefetch proactively loads all guest memory pages in the background.
// It iterates over every page in every UFFD region and copies it from the
// diff file into guest memory via UFFDIO_COPY. Pages already loaded by
// on-demand faults return nil from faultPage (EEXIST handled internally).
// This eliminates the per-request latency caused by lazy page faulting
// after snapshot restore.
//
// The goroutine blocks on readyCh before reading the uffd fd and mapping
// fields (establishes happens-before with handle()). It uses an internal
// context independent of the caller's RPC context so it survives after the
// create/resume RPC returns. Stop() cancels and joins the goroutine.
func (s *Server) Prefetch() {
ctx, cancel := context.WithCancel(context.Background())
s.prefetchCancel = cancel
s.prefetchDone = make(chan struct{})
go func() {
defer close(s.prefetchDone)
// Wait for Firecracker to connect and send the uffd fd.
select {
case <-s.readyCh:
case <-ctx.Done():
return
}
uffdFd := s.uffdFd
mapping := s.mapping
if mapping == nil {
return
}
var total, errored int
for _, region := range mapping.Regions {
pageSize := region.PageSize
if pageSize == 0 {
continue
}
for off := uintptr(0); off < region.Size; off += pageSize {
if ctx.Err() != nil {
slog.Debug("uffd prefetch cancelled",
"pages", total, "errors", errored)
return
}
addr := region.BaseHostVirtAddr + off
memOffset := int64(off) + int64(region.Offset)
if err := s.faultPage(ctx, uffdFd, addr, memOffset, pageSize); err != nil {
errored++
} else {
total++
}
}
}
slog.Info("uffd prefetch complete",
"pages", total, "errors", errored)
}()
}
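
faultPage's body is not in this hunk. The EEXIST tolerance the Prefetch
comment refers to typically lives around the UFFDIO_COPY ioctl; a hedged
sketch follows, assuming golang.org/x/sys/unix exposes UffdioCopy, the
UFFDIO_COPY ioctl constant, and SYS_IOCTL (the real helper may differ):

```go
package uffd

import (
	"unsafe"

	"golang.org/x/sys/unix"
)

// copyPage is a hedged sketch of the UFFDIO_COPY call behind a faultPage-style
// helper; it is not the code in this diff. page holds the bytes read from the
// snapshot diff file, guestAddr is the page's host virtual address.
func copyPage(uffdFd int, guestAddr uintptr, page []byte) error {
	cpy := unix.UffdioCopy{
		Dst: uint64(guestAddr),                         // guest page to populate
		Src: uint64(uintptr(unsafe.Pointer(&page[0]))), // source bytes in our process
		Len: uint64(len(page)),
	}
	_, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(uffdFd), unix.UFFDIO_COPY, uintptr(unsafe.Pointer(&cpy)))
	if errno == unix.EEXIST {
		// Already populated by an on-demand fault; Prefetch counts this as success.
		return nil
	}
	if errno != 0 {
		return errno
	}
	return nil
}
```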
// DiffFileSource serves pages from a snapshot's compact diff file using
// the header's block mapping to resolve offsets.
type DiffFileSource struct {

View File

@@ -258,6 +258,8 @@ func Run(opts ...Option) {
httpServer := &http.Server{
Addr: cfg.ListenAddr,
Handler: proxyWrapper,
ReadHeaderTimeout: 10 * time.Second,
IdleTimeout: 620 * time.Second, // > typical LB/Cloudflare upstream timeout
}
// Graceful shutdown on signal.