1
0
forked from wrenn/wrenn

fix: close stale TCP connections across snapshot/restore to prevent envd hangs

After Firecracker snapshot restore, zombie TCP sockets from the previous
session cause Go runtime corruption inside the guest VM, making envd
unresponsive. This manifests as infinite loading in the file browser and
terminal timeouts (524) in production (HTTP/2 + Cloudflare) but not locally.

Four-part fix:
- Add ServerConnTracker to envd that tracks connections via ConnState callback,
  closes idle connections and disables keep-alives before snapshot, then closes
  all pre-snapshot zombie connections on restore (while preserving post-restore
  connections like the /init request)
- Split envdclient into timeout (2min) and streaming (no timeout) HTTP clients;
  use streaming client for file transfers and process RPCs
- Close host-side idle envdclient connections before PrepareSnapshot so FIN
  packets propagate during the 3s quiesce window
- Add StreamingHTTPClient() accessor; streaming file transfer handlers in
  hostagent use it instead of the timeout client
This commit is contained in:
2026-05-02 05:19:37 +06:00
parent f3572f7356
commit 7ef9a64613
11 changed files with 183 additions and 30 deletions

View File

@ -0,0 +1,94 @@
package api
import (
"net"
"net/http"
"sync"
)
// ServerConnTracker tracks active HTTP connections via http.Server.ConnState.
// Before a Firecracker snapshot, it closes idle connections, disables
// keep-alives, and records which connections existed pre-snapshot. After
// restore, it closes ALL pre-snapshot connections (they are zombie TCP
// sockets) while leaving post-restore connections (like the /init request)
// untouched.
//
// Every method locks mu before touching any other field, so a single
// tracker may be shared between the server's ConnState callback and the
// snapshot/restore handlers.
type ServerConnTracker struct {
// mu serializes access to conns, preSnapshot, and srv.
mu sync.Mutex
// conns maps each tracked connection to the state most recently reported
// through Track. Entries are dropped on StateHijacked and StateClosed, and
// when PrepareForSnapshot or RestoreAfterSnapshot closes the connection.
conns map[net.Conn]http.ConnState
// preSnapshot is the set of connections that were tracked and not idle
// when PrepareForSnapshot last ran. It is nil until the first
// PrepareForSnapshot call and is reset to nil by RestoreAfterSnapshot.
preSnapshot map[net.Conn]struct{}
// srv is the server whose keep-alive setting is toggled around a snapshot.
// It stays nil until SetServer is called; while nil, the keep-alive
// toggling is skipped.
srv *http.Server
}
// NewServerConnTracker returns a tracker with an empty, ready-to-use
// connection table. The pre-snapshot set and the server reference start out
// nil; attach a server with SetServer to enable keep-alive toggling.
func NewServerConnTracker() *ServerConnTracker {
	tracker := new(ServerConnTracker)
	tracker.conns = make(map[net.Conn]http.ConnState)
	return tracker
}
// SetServer records the http.Server whose keep-alive setting this tracker
// switches off before a snapshot and back on after restore.
// Must be called before ListenAndServe.
func (t *ServerConnTracker) SetServer(srv *http.Server) {
	t.mu.Lock()
	defer t.mu.Unlock()

	t.srv = srv
}
// Track has the signature of the http.Server.ConnState callback and keeps
// the tracker's connection table in sync with the server.
//
// New, active, and idle connections are stored (or updated) with their
// latest state. Hijacked and closed connections are forgotten, both from the
// live table and from the pre-snapshot set. Any other state is ignored.
func (t *ServerConnTracker) Track(conn net.Conn, state http.ConnState) {
	t.mu.Lock()
	defer t.mu.Unlock()

	if state == http.StateHijacked || state == http.StateClosed {
		// The server no longer owns this connection; stop tracking it.
		// Deleting from a nil preSnapshot map is a harmless no-op.
		delete(t.conns, conn)
		delete(t.preSnapshot, conn)
		return
	}

	if state == http.StateNew || state == http.StateActive || state == http.StateIdle {
		t.conns[conn] = state
	}
}
// PrepareForSnapshot gets the server's connections ready for a VM snapshot.
//
// It disables keep-alives on the attached server (if any), closes every
// tracked connection that is currently idle, and remembers all remaining
// tracked connections as the pre-snapshot set. With keep-alives off, an
// in-flight connection closes once its response completes; any connection
// that nevertheless survives into the snapshot is closed later by
// RestoreAfterSnapshot.
//
// No GC is triggered here; per the original note, GC cycles are handled by
// PortSubsystem.Stop(), which runs before this.
func (t *ServerConnTracker) PrepareForSnapshot() {
	t.mu.Lock()
	defer t.mu.Unlock()

	if server := t.srv; server != nil {
		server.SetKeepAlivesEnabled(false)
	}

	survivors := make(map[net.Conn]struct{}, len(t.conns))
	for c, st := range t.conns {
		if st != http.StateIdle {
			// Still in use (or brand new): cannot be closed safely now, so
			// mark it for cleanup after restore.
			survivors[c] = struct{}{}
			continue
		}
		// Idle connections carry no request; close them right away.
		// Close is best-effort, so its error is deliberately ignored.
		_ = c.Close()
		delete(t.conns, c)
	}
	t.preSnapshot = survivors
}
// RestoreAfterSnapshot cleans up after a snapshot restore: it closes every
// connection in the pre-snapshot set (these are zombie TCP sockets once the
// VM has been restored), forgets them, clears the set, and turns keep-alives
// back on for the attached server (if any).
//
// Connections opened after the restore, such as the /init request that
// triggers this call, were never added to the pre-snapshot set and are
// therefore left alone.
//
// Safe to call on first boot: the pre-snapshot set is nil, so nothing is
// closed and the only effect is enabling keep-alives, which are already
// enabled by default.
func (t *ServerConnTracker) RestoreAfterSnapshot() {
	t.mu.Lock()
	defer t.mu.Unlock()

	stale := t.preSnapshot
	t.preSnapshot = nil
	for c := range stale {
		// Close is best-effort on a dead socket; its error is ignored.
		_ = c.Close()
		delete(t.conns, c)
	}

	if t.srv == nil {
		return
	}
	t.srv.SetKeepAlivesEnabled(true)
}

View File

@ -99,7 +99,7 @@ func TestGetFilesContentDisposition(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
// Create request and response recorder
req := httptest.NewRequest(http.MethodGet, "/files?path="+url.QueryEscape(tempFile), nil)
@ -148,7 +148,7 @@ func TestGetFilesContentDispositionWithNestedPath(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
// Create request and response recorder
req := httptest.NewRequest(http.MethodGet, "/files?path="+url.QueryEscape(tempFile), nil)
@ -191,7 +191,7 @@ func TestGetFiles_GzipEncoding_ExplicitIdentityOffWithRange(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
// Create request and response recorder
req := httptest.NewRequest(http.MethodGet, "/files?path="+url.QueryEscape(tempFile), nil)
@ -232,7 +232,7 @@ func TestGetFiles_GzipDownload(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
req := httptest.NewRequest(http.MethodGet, "/files?path="+url.QueryEscape(tempFile), nil)
req.Header.Set("Accept-Encoding", "gzip")
@ -297,7 +297,7 @@ func TestPostFiles_GzipUpload(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
req := httptest.NewRequest(http.MethodPost, "/files?path="+url.QueryEscape(destPath), &gzBuf)
req.Header.Set("Content-Type", mpWriter.FormDataContentType())
@ -357,7 +357,7 @@ func TestGzipUploadThenGzipDownload(t *testing.T) {
EnvVars: utils.NewMap[string, string](),
User: currentUser.Username,
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
uploadReq := httptest.NewRequest(http.MethodPost, "/files?path="+url.QueryEscape(destPath), &gzBuf)
uploadReq.Header.Set("Content-Type", mpWriter.FormDataContentType())

View File

@ -150,6 +150,12 @@ func (a *API) PostInit(w http.ResponseWriter, r *http.Request) {
host.PollForMMDSOpts(ctx, a.mmdsChan, a.defaults.EnvVars)
}()
// Close zombie connections from before the snapshot and re-enable
// keep-alives. On first boot this is a no-op (no zombie connections).
if a.connTracker != nil {
a.connTracker.RestoreAfterSnapshot()
}
// Start the port scanner and forwarder if they were stopped by a
// pre-snapshot prepare call. Start is a no-op if already running,
// so this is safe on first boot and only takes effect after restore.

View File

@ -79,7 +79,7 @@ func newTestAPI(accessToken *SecureToken, mmdsClient MMDSClient) *API {
defaults := &execcontext.Defaults{
EnvVars: utils.NewMap[string, string](),
}
api := New(&logger, defaults, nil, false, context.Background(), nil, "test")
api := New(&logger, defaults, nil, false, context.Background(), nil, nil, "test")
if accessToken != nil {
api.accessToken.TakeFrom(accessToken)
}

View File

@ -7,9 +7,11 @@ import (
"net/http"
)
// PostSnapshotPrepare quiesces continuous goroutines (port scanner, forwarder)
// and forces a GC cycle before Firecracker takes a VM snapshot. This ensures
// the Go runtime's page allocator is in a consistent state when vCPUs are frozen.
// PostSnapshotPrepare quiesces continuous goroutines (port scanner, forwarder),
// closes idle HTTP connections, and forces a GC cycle before Firecracker takes
// a VM snapshot. Closing connections prevents Go runtime corruption from stale
// TCP state after snapshot restore. Keep-alives are disabled so the current
// request's connection also closes after the response.
//
// Called by the host agent as a best-effort signal before vm.Pause().
func (a *API) PostSnapshotPrepare(w http.ResponseWriter, r *http.Request) {
@ -20,6 +22,11 @@ func (a *API) PostSnapshotPrepare(w http.ResponseWriter, r *http.Request) {
a.logger.Info().Msg("snapshot/prepare: port subsystem quiesced")
}
if a.connTracker != nil {
a.connTracker.PrepareForSnapshot()
a.logger.Info().Msg("snapshot/prepare: idle connections closed, keep-alives disabled")
}
w.Header().Set("Cache-Control", "no-store")
w.WriteHeader(http.StatusNoContent)
}

View File

@ -47,9 +47,10 @@ type API struct {
// long-lived goroutines after snapshot restore.
rootCtx context.Context
portSubsystem *publicport.PortSubsystem
connTracker *ServerConnTracker
}
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool, rootCtx context.Context, portSubsystem *publicport.PortSubsystem, version string) *API {
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool, rootCtx context.Context, portSubsystem *publicport.PortSubsystem, connTracker *ServerConnTracker, version string) *API {
return &API{
logger: l,
defaults: defaults,
@ -60,6 +61,7 @@ func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.
accessToken: &SecureToken{},
rootCtx: rootCtx,
portSubsystem: portSubsystem,
connTracker: connTracker,
version: version,
}
}