forked from wrenn/wrenn
Fix review findings: IP collision, pause race, proxy path, ENV ordering, conn drain
- Fix IP address collision at slot 32768+ by using bitwise shifts instead of byte-truncating division in network slot addressing
- Add per-sandbox lifecycleMu to serialize concurrent Pause/Destroy calls
- Sanitize proxy forwarding path with path.Clean
- Sort ENV keys in recipe shell preamble for deterministic ordering
- Fix ConnTracker goroutine leak by adding cancel channel to Drain/Reset
- Update context_test to assert deterministic ENV ordering
This commit is contained in:
@ -12,6 +12,11 @@ import (
|
||||
// ConnTracker counts in-flight proxy connections for a sandbox so they
// can be drained before the VM is paused. The zero value is ready to use.
type ConnTracker struct {
	draining atomic.Bool    // once set, Acquire rejects new connections
	wg       sync.WaitGroup // one unit per in-flight connection

	// cancelMu guards cancelDrain, the channel a timed-out Drain leaves
	// behind so that Reset can signal it to stop waiting, preventing
	// goroutine leaks when pause attempts fail repeatedly.
	cancelMu    sync.Mutex
	cancelDrain chan struct{}
}
|
||||
|
||||
// Acquire registers one in-flight connection. Returns false if the tracker
|
||||
@ -38,14 +43,14 @@ func (t *ConnTracker) Release() {
|
||||
|
||||
// Drain marks the tracker as draining (all future Acquire calls return
|
||||
// false) and waits up to timeout for in-flight connections to finish.
|
||||
//
|
||||
// Note: if the timeout expires with connections still in-flight, the
|
||||
// internal goroutine waiting on wg.Wait() will remain until those
|
||||
// connections complete. This is bounded by the number of hung connections
|
||||
// at drain time and self-heals once they close.
|
||||
func (t *ConnTracker) Drain(timeout time.Duration) {
|
||||
t.draining.Store(true)
|
||||
|
||||
cancel := make(chan struct{})
|
||||
t.cancelMu.Lock()
|
||||
t.cancelDrain = cancel
|
||||
t.cancelMu.Unlock()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
t.wg.Wait()
|
||||
@ -54,13 +59,27 @@ func (t *ConnTracker) Drain(timeout time.Duration) {
|
||||
|
||||
select {
|
||||
case <-done:
|
||||
case <-cancel:
|
||||
// Reset was called; stop waiting.
|
||||
case <-time.After(timeout):
|
||||
}
|
||||
}
|
||||
|
||||
// Reset re-enables the tracker after a failed drain. This allows the
|
||||
// sandbox to accept proxy connections again if the pause operation fails
|
||||
// and the VM is resumed.
|
||||
// and the VM is resumed. It also cancels any lingering Drain goroutine.
|
||||
func (t *ConnTracker) Reset() {
|
||||
t.cancelMu.Lock()
|
||||
if t.cancelDrain != nil {
|
||||
select {
|
||||
case <-t.cancelDrain:
|
||||
// Already closed.
|
||||
default:
|
||||
close(t.cancelDrain)
|
||||
}
|
||||
t.cancelDrain = nil
|
||||
}
|
||||
t.cancelMu.Unlock()
|
||||
|
||||
t.draining.Store(false)
|
||||
}
|
||||
|
||||
@ -49,6 +49,7 @@ type Manager struct {
|
||||
// sandboxState holds the runtime state for a single sandbox.
|
||||
type sandboxState struct {
|
||||
models.Sandbox
|
||||
lifecycleMu sync.Mutex // serializes Pause/Destroy/Resume on this sandbox
|
||||
slot *network.Slot
|
||||
client *envdclient.Client
|
||||
connTracker *ConnTracker // tracks in-flight proxy connections for pre-pause drain
|
||||
@ -259,6 +260,9 @@ func (m *Manager) Destroy(ctx context.Context, sandboxID string) error {
|
||||
m.mu.Unlock()
|
||||
|
||||
if ok {
|
||||
// Wait for any in-progress Pause to finish before tearing down resources.
|
||||
sb.lifecycleMu.Lock()
|
||||
defer sb.lifecycleMu.Unlock()
|
||||
m.cleanup(ctx, sb)
|
||||
}
|
||||
|
||||
@ -307,6 +311,11 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Serialize lifecycle operations on this sandbox to prevent concurrent
|
||||
// Pause/Destroy calls from corrupting Firecracker state.
|
||||
sb.lifecycleMu.Lock()
|
||||
defer sb.lifecycleMu.Unlock()
|
||||
|
||||
if sb.Status != models.StatusRunning {
|
||||
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user