1
0
forked from wrenn/wrenn

fix: close stale TCP connections across snapshot/restore to prevent envd hangs

After Firecracker snapshot restore, zombie TCP sockets from the previous
session cause Go runtime corruption inside the guest VM, making envd
unresponsive. This manifests as infinite loading in the file browser and
terminal timeouts (524) in production (HTTP/2 + Cloudflare) but not locally.

Four-part fix:
- Add ServerConnTracker to envd that tracks connections via ConnState callback,
  closes idle connections and disables keep-alives before snapshot, then closes
  all pre-snapshot zombie connections on restore (while preserving post-restore
  connections like the /init request)
- Split envdclient into timeout (2min) and streaming (no timeout) HTTP clients;
  use streaming client for file transfers and process RPCs
- Close host-side idle envdclient connections before PrepareSnapshot so FIN
  packets propagate during the 3s quiesce window
- Add StreamingHTTPClient() accessor; streaming file transfer handlers in
  hostagent use it instead of the timeout client
This commit is contained in:
2026-05-02 05:19:37 +06:00
parent f3572f7356
commit 7ef9a64613
11 changed files with 183 additions and 30 deletions

View File

@ -19,10 +19,11 @@ import (
// Client wraps the Connect RPC client for envd's Process and Filesystem services.
type Client struct {
hostIP string
base string
healthURL string
httpClient *http.Client
hostIP string
base string
healthURL string
httpClient *http.Client
streamingClient *http.Client
process genconnect.ProcessClient
filesystem genconnect.FilesystemClient
@ -32,29 +33,44 @@ type Client struct {
// New builds a Client for the envd instance reachable at hostIP. It derives
// the base URL via baseURL, creates one HTTP client from newHTTPClient and one
// from newStreamingHTTPClient, and wires the generated Connect service clients
// on top of them.
func New(hostIP string) *Client {
base := baseURL(hostIP)
httpClient := newHTTPClient()
streamingClient := newStreamingHTTPClient()
return &Client{
hostIP: hostIP,
base: base,
healthURL: base + "/health",
httpClient: httpClient,
process: genconnect.NewProcessClient(httpClient, base),
filesystem: genconnect.NewFilesystemClient(httpClient, base),
hostIP: hostIP,
base: base,
healthURL: base + "/health",
httpClient: httpClient,
streamingClient: streamingClient,
// Process RPCs go through the streaming client, i.e. the one created by
// newStreamingHTTPClient rather than newHTTPClient.
process: genconnect.NewProcessClient(streamingClient, base),
// Filesystem RPCs stay on the client created by newHTTPClient.
// NOTE(review): if the Filesystem service exposes any long-lived streaming
// RPC, a client-level timeout on httpClient would cut it off — confirm.
filesystem: genconnect.NewFilesystemClient(httpClient, base),
}
}
// CloseIdleConnections drops every idle keep-alive connection held by the
// unary and the streaming HTTP clients; connections that are currently in use
// are left alone. Invoke it ahead of a VM snapshot so that stale TCP state is
// removed from the guest.
func (c *Client) CloseIdleConnections() {
	// Same order as the fields are declared: unary client first, then streaming.
	for _, hc := range []*http.Client{c.httpClient, c.streamingClient} {
		hc.CloseIdleConnections()
	}
}
// BaseURL returns the HTTP base URL for reaching envd. The value is computed
// once from the host IP when the Client is constructed and is the prefix the
// health URL and the RPC clients are built on.
func (c *Client) BaseURL() string {
return c.base
}
// HTTPClient returns the underlying http.Client used for envd requests.
// Use this instead of http.DefaultClient when making direct HTTP calls to envd
// (e.g. file streaming) to avoid sharing the global transport with proxy traffic.
// HTTPClient returns the http.Client with a 2-minute request timeout.
// Suitable for short-lived envd calls (health, init, snapshot/prepare).
// The same *http.Client instance is returned on every call, so its transport
// and connection pool are shared by all callers. For requests that may
// legitimately exceed the timeout, use StreamingHTTPClient instead.
func (c *Client) HTTPClient() *http.Client {
return c.httpClient
}
// StreamingHTTPClient returns the http.Client without a request timeout.
// Use for streaming file transfers or any request that may run indefinitely.
// The same *http.Client instance is returned on every call. Because no
// client-level timeout applies, a caller that needs an upper bound should
// put a deadline or cancellation on the request's context.
func (c *Client) StreamingHTTPClient() *http.Client {
return c.streamingClient
}
// ExecResult holds the output of a command execution.
type ExecResult struct {
Stdout []byte

View File

@ -20,6 +20,22 @@ func baseURL(hostIP string) string {
// so that proxy traffic to user services inside the sandbox cannot interfere
// with envd RPC connections (PTY streams, exec, file ops).
func newHTTPClient() *http.Client {
	// TCP dial settings: give up connecting after 10s and use a 30s
	// keep-alive period on established connections.
	dialer := &net.Dialer{
		Timeout:   10 * time.Second,
		KeepAlive: 30 * time.Second,
	}

	// A private transport per client: up to 10 idle connections per host,
	// each discarded after sitting idle for 90s.
	transport := &http.Transport{
		MaxIdleConnsPerHost: 10,
		IdleConnTimeout:     90 * time.Second,
		DialContext:         dialer.DialContext,
	}

	// The 2-minute Timeout bounds the whole request, including reading the
	// response body, so this client is only suitable for short-lived calls.
	return &http.Client{
		Timeout:   2 * time.Minute,
		Transport: transport,
	}
}
// newStreamingHTTPClient returns an http.Client without an overall timeout,
// for long-lived streaming RPCs (PTY, exec stream) that can run indefinitely.
func newStreamingHTTPClient() *http.Client {
return &http.Client{
Transport: &http.Transport{
MaxIdleConnsPerHost: 10,

View File

@ -459,7 +459,7 @@ func (s *Server) WriteFileStream(
}
httpReq.Header.Set("Content-Type", mpWriter.FormDataContentType())
resp, err := client.HTTPClient().Do(httpReq)
resp, err := client.StreamingHTTPClient().Do(httpReq)
if err != nil {
pw.CloseWithError(err)
<-errCh
@ -504,7 +504,7 @@ func (s *Server) ReadFileStream(
return connect.NewError(connect.CodeInternal, fmt.Errorf("create request: %w", err))
}
resp, err := client.HTTPClient().Do(httpReq)
resp, err := client.StreamingHTTPClient().Do(httpReq)
if err != nil {
return connect.NewError(connect.CodeInternal, fmt.Errorf("read file stream: %w", err))
}

View File

@ -387,9 +387,17 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
sb.connTracker.Drain(2 * time.Second)
slog.Debug("pause: proxy connections drained", "id", sandboxID)
// Step 0b: Signal envd to quiesce continuous goroutines (port scanner,
// forwarder) and run GC before freezing vCPUs. This prevents Go runtime
// page allocator corruption ("bad summary data") on snapshot restore.
// Step 0b: Close host-side idle connections to envd. Done before
// PrepareSnapshot so FIN packets propagate to the guest during the
// PrepareSnapshot window (no extra sleep needed).
sb.client.CloseIdleConnections()
slog.Debug("pause: envd client idle connections closed", "id", sandboxID)
// Step 0c: Signal envd to quiesce continuous goroutines (port scanner,
// forwarder), close idle HTTP connections, and run GC before freezing
// vCPUs. This prevents Go runtime page allocator corruption ("bad
// summary data") on snapshot restore. The time spent in this call (at
// most the 3s timeout below) also lets the guest kernel process the FINs
// from Step 0b.
// Best-effort: a failure is logged but does not abort the pause.
func() {
prepCtx, prepCancel := context.WithTimeout(ctx, 3*time.Second)