
Add pre-pause proxy connection drain and sandbox proxy caching

Introduce ConnTracker (atomic.Bool + WaitGroup) to track in-flight proxy
connections per sandbox. Before pausing a VM, the manager drains active
connections with a 2s grace period, preventing Go runtime corruption
inside the guest caused by stale TCP state surviving Firecracker
snapshot/restore.

Also add:
- AcquireProxyConn on Manager for atomic lookup + connection tracking
- Proxy cache (120s TTL) on CP SandboxProxyWrapper with single-query
  DB lookup (GetSandboxProxyTarget) to avoid two round-trips
- Reset() on ConnTracker to re-enable connections if pause fails
2026-04-01 15:09:44 +06:00
parent 377e856c8f
commit 2b4c5e0176
7 changed files with 253 additions and 54 deletions
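
The worker-side proxy handler that consumes AcquireProxyConn is in one of the changed files not shown below. As a rough illustration of the intended pairing (acquire before dialing the guest, release when the request finishes), here is a minimal sketch; everything except Manager, AcquireProxyConn, ConnTracker, and Release (the handler shape, header-based routing, and guest port) is an assumption, not code from this commit.

```go
package proxy

import (
	"net"
	"net/http"
	"net/http/httputil"
	"net/url"

	"example.com/worker/sandbox" // placeholder import path; the real package is the one shown below
)

// Handler pairs Manager.AcquireProxyConn with ConnTracker.Release around each
// proxied request. Everything except those identifiers is illustrative.
func Handler(m *sandbox.Manager) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		sandboxID := r.Header.Get("X-Sandbox-ID") // assumed routing scheme

		ip, tracker, ok := m.AcquireProxyConn(sandboxID)
		if !ok {
			// Sandbox not found, not running, or draining for an imminent pause.
			http.Error(w, "sandbox unavailable", http.StatusBadGateway)
			return
		}
		defer tracker.Release() // exactly one Release per successful Acquire

		// Guest port and scheme are assumptions for the sketch.
		target := &url.URL{Scheme: "http", Host: net.JoinHostPort(ip.String(), "8080")}
		httputil.NewSingleHostReverseProxy(target).ServeHTTP(w, r)
	}
}
```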


@@ -0,0 +1,66 @@
package sandbox

import (
	"sync"
	"sync/atomic"
	"time"
)

// ConnTracker tracks active proxy connections for a single sandbox and
// provides a drain mechanism for pre-pause graceful shutdown.
// It is safe for concurrent use.
type ConnTracker struct {
	draining atomic.Bool
	wg       sync.WaitGroup
}

// Acquire registers one in-flight connection. Returns false if the tracker
// is already draining; the caller must not call Release in that case.
func (t *ConnTracker) Acquire() bool {
	if t.draining.Load() {
		return false
	}
	t.wg.Add(1)
	// Re-check after Add: Drain may have set draining between our Load
	// and Add. If so, undo the Add and reject the connection.
	if t.draining.Load() {
		t.wg.Done()
		return false
	}
	return true
}

// Release marks one connection as complete. Must be called exactly once
// per successful Acquire.
func (t *ConnTracker) Release() {
	t.wg.Done()
}

// Drain marks the tracker as draining (all future Acquire calls return
// false) and waits up to timeout for in-flight connections to finish.
//
// Note: if the timeout expires with connections still in-flight, the
// internal goroutine waiting on wg.Wait() will remain until those
// connections complete. This is bounded by the number of hung connections
// at drain time and self-heals once they close.
func (t *ConnTracker) Drain(timeout time.Duration) {
	t.draining.Store(true)

	done := make(chan struct{})
	go func() {
		t.wg.Wait()
		close(done)
	}()

	select {
	case <-done:
	case <-time.After(timeout):
	}
}

// Reset re-enables the tracker after a failed drain. This allows the
// sandbox to accept proxy connections again if the pause operation fails
// and the VM is resumed.
func (t *ConnTracker) Reset() {
	t.draining.Store(false)
}

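Taken on its own, ConnTracker's contract can be exercised with a short test. The sketch below is not part of this commit; it checks the properties the Pause path below relies on: Drain returns as soon as in-flight connections Release rather than always burning the full timeout, Acquire is rejected while draining, and Reset re-enables it.

```go
package sandbox

import (
	"testing"
	"time"
)

// Illustrative test, not part of this commit.
func TestConnTrackerDrainAndReset(t *testing.T) {
	tr := &ConnTracker{}

	if !tr.Acquire() {
		t.Fatal("Acquire should succeed before draining")
	}

	// Release the in-flight connection shortly after Drain starts waiting.
	go func() {
		time.Sleep(50 * time.Millisecond)
		tr.Release()
	}()

	start := time.Now()
	tr.Drain(2 * time.Second)
	if time.Since(start) >= 2*time.Second {
		t.Fatal("Drain should return once the connection is released, not on timeout")
	}

	if tr.Acquire() {
		t.Fatal("Acquire should be rejected while draining")
	}

	tr.Reset()
	if !tr.Acquire() {
		t.Fatal("Acquire should succeed again after Reset")
	}
	tr.Release()
}
```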

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log/slog"
"net"
"os"
"os/exec"
"path/filepath"
@@ -50,7 +51,8 @@ type sandboxState struct {
models.Sandbox
slot *network.Slot
client *envdclient.Client
uffdSocketPath string // non-empty for sandboxes restored from snapshot
connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain
uffdSocketPath string // non-empty for sandboxes restored from snapshot
dmDevice *devicemapper.SnapshotDevice
baseImagePath string // path to the base template rootfs (for loop registry release)
@@ -224,6 +226,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
},
slot: slot,
client: client,
connTracker: &ConnTracker{},
dmDevice: dmDev,
baseImagePath: baseRootfs,
}
@@ -308,10 +311,17 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
}
// Step 0: Drain in-flight proxy connections before freezing vCPUs.
// This prevents Go runtime corruption inside the guest caused by stale
// TCP state from connections that were alive when the VM was snapshotted.
sb.connTracker.Drain(2 * time.Second)
slog.Debug("pause: proxy connections drained", "id", sandboxID)
pauseStart := time.Now()
// Step 1: Pause the VM (freeze vCPUs).
if err := m.vm.Pause(ctx, sandboxID); err != nil {
sb.connTracker.Reset()
return fmt.Errorf("pause VM: %w", err)
}
slog.Debug("pause: VM paused", "id", sandboxID, "elapsed", time.Since(pauseStart))
@@ -326,8 +336,10 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
// resumeOnError unpauses the VM so the sandbox stays usable when a
// post-freeze step fails. If the resume itself fails, the sandbox is
// left frozen — the caller should destroy it.
// left frozen — the caller should destroy it. It also resets the
// connection tracker so the sandbox can accept proxy connections again.
resumeOnError := func() {
sb.connTracker.Reset()
if err := m.vm.Resume(ctx, sandboxID); err != nil {
slog.Error("failed to resume VM after pause error — sandbox is frozen", "id", sandboxID, "error", err)
}
@@ -692,6 +704,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int)
},
slot: slot,
client: client,
connTracker: &ConnTracker{},
uffdSocketPath: uffdSocketPath,
dmDevice: dmDev,
baseImagePath: baseImagePath,
@@ -1094,6 +1107,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
},
slot: slot,
client: client,
connTracker: &ConnTracker{},
uffdSocketPath: uffdSocketPath,
dmDevice: dmDev,
baseImagePath: baseRootfs,
@@ -1190,6 +1204,25 @@ func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
return sb.client, nil
}
// AcquireProxyConn atomically looks up a sandbox by ID and registers an
// in-flight proxy connection. Returns the sandbox's host-reachable IP, the
// connection tracker, and true on success. The caller must call
// tracker.Release() when the request completes. Returns zero values and
// false if the sandbox is not found, not running, or is draining for a pause.
func (m *Manager) AcquireProxyConn(sandboxID string) (net.IP, *ConnTracker, bool) {
m.mu.RLock()
sb, ok := m.boxes[sandboxID]
m.mu.RUnlock()
if !ok || sb.Status != models.StatusRunning {
return nil, nil, false
}
if !sb.connTracker.Acquire() {
return nil, nil, false
}
return sb.HostIP, sb.connTracker, true
}
// Ping resets the inactivity timer for a running sandbox.
func (m *Manager) Ping(sandboxID string) error {
m.mu.Lock()
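
The control-plane half of the change (the proxy cache on SandboxProxyWrapper and the single-query GetSandboxProxyTarget lookup named in the commit message) is in files not shown above. Assuming the cache simply memoizes the routing target per sandbox for the 120s TTL, a minimal sketch could look like the following; apart from the GetSandboxProxyTarget name, the types, fields, and signatures are illustrative assumptions.

```go
package cp

import (
	"context"
	"sync"
	"time"
)

// ProxyTarget holds the routing information the CP needs to forward a
// request; the exact fields are assumptions, not taken from this commit.
type ProxyTarget struct {
	WorkerAddr string
	SandboxIP  string
}

type targetQuerier interface {
	// GetSandboxProxyTarget is the single-query DB lookup named in the
	// commit message; its signature here is assumed.
	GetSandboxProxyTarget(ctx context.Context, sandboxID string) (ProxyTarget, error)
}

type cacheEntry struct {
	target  ProxyTarget
	expires time.Time
}

// proxyCache memoizes GetSandboxProxyTarget results for a fixed TTL (120s in
// the commit message) so each proxied request costs at most one DB round-trip.
type proxyCache struct {
	db  targetQuerier
	ttl time.Duration

	mu      sync.Mutex
	entries map[string]cacheEntry
}

func newProxyCache(db targetQuerier, ttl time.Duration) *proxyCache {
	return &proxyCache{db: db, ttl: ttl, entries: make(map[string]cacheEntry)}
}

// Get returns a cached target if it is still fresh, otherwise performs the
// single DB lookup and caches the result.
func (c *proxyCache) Get(ctx context.Context, sandboxID string) (ProxyTarget, error) {
	c.mu.Lock()
	e, ok := c.entries[sandboxID]
	c.mu.Unlock()
	if ok && time.Now().Before(e.expires) {
		return e.target, nil
	}

	target, err := c.db.GetSandboxProxyTarget(ctx, sandboxID)
	if err != nil {
		return ProxyTarget{}, err
	}

	c.mu.Lock()
	c.entries[sandboxID] = cacheEntry{target: target, expires: time.Now().Add(c.ttl)}
	c.mu.Unlock()
	return target, nil
}
```

A bounded TTL keeps routing staleness to at most two minutes while removing the per-request double round-trip the commit message calls out.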