1
0
forked from wrenn/wrenn
Files
wrenn-releases/internal/envdclient/dialer.go
pptx704 7ef9a64613 fix: close stale TCP connections across snapshot/restore to prevent envd hangs
After Firecracker snapshot restore, zombie TCP sockets from the previous
session cause Go runtime corruption inside the guest VM, making envd
unresponsive. This manifests as infinite loading in the file browser and
terminal timeouts (524) in production (HTTP/2 + Cloudflare) but not locally.

Four-part fix:
- Add ServerConnTracker to envd that tracks connections via ConnState callback,
  closes idle connections and disables keep-alives before snapshot, then closes
  all pre-snapshot zombie connections on restore (while preserving post-restore
  connections like the /init request)
- Split envdclient into timeout (2min) and streaming (no timeout) HTTP clients;
  use streaming client for file transfers and process RPCs
- Close host-side idle envdclient connections before PrepareSnapshot so FIN
  packets propagate during the 3s quiesce window
- Add StreamingHTTPClient() accessor; streaming file transfer handlers in
  hostagent use it instead of the timeout client
2026-05-02 05:19:37 +06:00

50 lines
1.4 KiB
Go

package envdclient
import (
"fmt"
"net"
"net/http"
"time"
)
// envdPort is the default port envd listens on inside the guest.
const envdPort = 49983
// baseURL returns the HTTP base URL for reaching envd at the given host IP.
func baseURL(hostIP string) string {
return fmt.Sprintf("http://%s:%d", hostIP, envdPort)
}
// newHTTPClient returns an http.Client with a dedicated transport for talking
// to envd. The transport is intentionally separate from http.DefaultTransport
// so that proxy traffic to user services inside the sandbox cannot interfere
// with envd RPC connections (PTY streams, exec, file ops).
func newHTTPClient() *http.Client {
return &http.Client{
Timeout: 2 * time.Minute,
Transport: &http.Transport{
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
},
}
}
// newStreamingHTTPClient returns an http.Client without an overall timeout,
// for long-lived streaming RPCs (PTY, exec stream) that can run indefinitely.
func newStreamingHTTPClient() *http.Client {
return &http.Client{
Transport: &http.Transport{
MaxIdleConnsPerHost: 10,
IdleConnTimeout: 90 * time.Second,
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
},
}
}