forked from wrenn/wrenn
fix: close stale TCP connections across snapshot/restore to prevent envd hangs
After Firecracker snapshot restore, zombie TCP sockets from the previous session cause Go runtime corruption inside the guest VM, making envd unresponsive. This manifests as infinite loading in the file browser and terminal timeouts (HTTP 524) in production (HTTP/2 + Cloudflare), but not locally.

Four-part fix:
- Add ServerConnTracker to envd. It tracks connections via the ConnState callback, closes idle connections and disables keep-alives before a snapshot, then closes all pre-snapshot zombie connections on restore (while preserving post-restore connections such as the /init request).
- Split envdclient into a timeout (2 min) HTTP client and a streaming (no timeout) HTTP client; use the streaming client for file transfers and process RPCs.
- Close host-side idle envdclient connections before PrepareSnapshot so that FIN packets propagate during the 3 s quiesce window.
- Add a StreamingHTTPClient() accessor; the streaming file transfer handlers in hostagent use it instead of the timeout client.
This commit is contained in:
@ -19,10 +19,11 @@ import (
|
||||
|
||||
// Client wraps the Connect RPC client for envd's Process and Filesystem services.
|
||||
type Client struct {
|
||||
hostIP string
|
||||
base string
|
||||
healthURL string
|
||||
httpClient *http.Client
|
||||
hostIP string
|
||||
base string
|
||||
healthURL string
|
||||
httpClient *http.Client
|
||||
streamingClient *http.Client
|
||||
|
||||
process genconnect.ProcessClient
|
||||
filesystem genconnect.FilesystemClient
|
||||
@ -32,29 +33,44 @@ type Client struct {
|
||||
func New(hostIP string) *Client {
|
||||
base := baseURL(hostIP)
|
||||
httpClient := newHTTPClient()
|
||||
streamingClient := newStreamingHTTPClient()
|
||||
|
||||
return &Client{
|
||||
hostIP: hostIP,
|
||||
base: base,
|
||||
healthURL: base + "/health",
|
||||
httpClient: httpClient,
|
||||
process: genconnect.NewProcessClient(httpClient, base),
|
||||
filesystem: genconnect.NewFilesystemClient(httpClient, base),
|
||||
hostIP: hostIP,
|
||||
base: base,
|
||||
healthURL: base + "/health",
|
||||
httpClient: httpClient,
|
||||
streamingClient: streamingClient,
|
||||
process: genconnect.NewProcessClient(streamingClient, base),
|
||||
filesystem: genconnect.NewFilesystemClient(httpClient, base),
|
||||
}
|
||||
}
|
||||
|
||||
// CloseIdleConnections closes idle connections on both the unary and streaming
|
||||
// transports. Call this before taking a VM snapshot to remove stale TCP state
|
||||
// from the guest.
|
||||
func (c *Client) CloseIdleConnections() {
|
||||
c.httpClient.CloseIdleConnections()
|
||||
c.streamingClient.CloseIdleConnections()
|
||||
}
|
||||
|
||||
// BaseURL returns the HTTP base URL for reaching envd.
|
||||
func (c *Client) BaseURL() string {
|
||||
return c.base
|
||||
}
|
||||
|
||||
// HTTPClient returns the underlying http.Client used for envd requests.
|
||||
// Use this instead of http.DefaultClient when making direct HTTP calls to envd
|
||||
// (e.g. file streaming) to avoid sharing the global transport with proxy traffic.
|
||||
// HTTPClient returns the http.Client with a 2-minute request timeout.
|
||||
// Suitable for short-lived envd calls (health, init, snapshot/prepare).
|
||||
func (c *Client) HTTPClient() *http.Client {
|
||||
return c.httpClient
|
||||
}
|
||||
|
||||
// StreamingHTTPClient returns the http.Client without a request timeout.
|
||||
// Use for streaming file transfers or any request that may run indefinitely.
|
||||
func (c *Client) StreamingHTTPClient() *http.Client {
|
||||
return c.streamingClient
|
||||
}
|
||||
|
||||
// ExecResult holds the output of a command execution.
|
||||
type ExecResult struct {
|
||||
Stdout []byte
|
||||
|
||||
Reference in New Issue
Block a user