From bb582deefaddf3fbad09ffde97f9a5e3ddf43f65 Mon Sep 17 00:00:00 2001 From: pptx704 Date: Sat, 2 May 2026 13:48:51 +0600 Subject: [PATCH] fix: prevent sandbox halt after resume by fixing HTTP/2 HOL blocking and adding timeouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disable HTTP/2 on both host agent server and CP→agent transport — multiplexing caused head-of-line blocking when a slow sandbox RPC stalled the shared connection. Add ResponseHeaderTimeout to envd HTTP clients. Merge SetDefaults into Resume's PostInit call to eliminate an extra round-trip that could hang on a stale connection. --- cmd/host-agent/main.go | 5 +++++ internal/envdclient/dialer.go | 10 ++++++---- internal/hostagent/server.go | 9 +-------- internal/sandbox/manager.go | 8 +++++--- pkg/lifecycle/hostpool.go | 14 +++++++++++++- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/cmd/host-agent/main.go b/cmd/host-agent/main.go index 89d65da..8f3a894 100644 --- a/cmd/host-agent/main.go +++ b/cmd/host-agent/main.go @@ -154,6 +154,11 @@ func main() { Addr: listenAddr, ReadHeaderTimeout: 10 * time.Second, IdleTimeout: 620 * time.Second, // > typical LB upstream timeout (600s) + // Disable HTTP/2: empty non-nil map prevents Go from registering + // the h2 ALPN token. Connect RPC works over HTTP/1.1; HTTP/2 + // multiplexing causes HOL blocking when a slow sandbox RPC stalls + // the shared connection. + TLSNextProto: make(map[string]func(*http.Server, *tls.Conn, http.Handler)), } // mTLS is mandatory — refuse to start without a valid certificate. diff --git a/internal/envdclient/dialer.go b/internal/envdclient/dialer.go index ffd3509..a7dd2a9 100644 --- a/internal/envdclient/dialer.go +++ b/internal/envdclient/dialer.go @@ -23,8 +23,9 @@ func newHTTPClient() *http.Client { return &http.Client{ Timeout: 2 * time.Minute, Transport: &http.Transport{ - MaxIdleConnsPerHost: 10, - IdleConnTimeout: 90 * time.Second, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + ResponseHeaderTimeout: 30 * time.Second, DialContext: (&net.Dialer{ Timeout: 10 * time.Second, KeepAlive: 30 * time.Second, @@ -38,8 +39,9 @@ func newHTTPClient() *http.Client { func newStreamingHTTPClient() *http.Client { return &http.Client{ Transport: &http.Transport{ - MaxIdleConnsPerHost: 10, - IdleConnTimeout: 90 * time.Second, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + ResponseHeaderTimeout: 30 * time.Second, DialContext: (&net.Dialer{ Timeout: 10 * time.Second, KeepAlive: 30 * time.Second, diff --git a/internal/hostagent/server.go b/internal/hostagent/server.go index a1b40c8..816a99f 100644 --- a/internal/hostagent/server.go +++ b/internal/hostagent/server.go @@ -109,18 +109,11 @@ func (s *Server) ResumeSandbox( req *connect.Request[pb.ResumeSandboxRequest], ) (*connect.Response[pb.ResumeSandboxResponse], error) { msg := req.Msg - sb, err := s.mgr.Resume(ctx, msg.SandboxId, int(msg.TimeoutSec), msg.KernelVersion) + sb, err := s.mgr.Resume(ctx, msg.SandboxId, int(msg.TimeoutSec), msg.KernelVersion, msg.DefaultUser, msg.DefaultEnv) if err != nil { return nil, connect.NewError(connect.CodeInternal, err) } - // Apply template defaults (user, env vars) if provided. - if msg.DefaultUser != "" || len(msg.DefaultEnv) > 0 { - if err := s.mgr.SetDefaults(ctx, sb.ID, msg.DefaultUser, msg.DefaultEnv); err != nil { - slog.Warn("failed to set sandbox defaults on resume", "sandbox", sb.ID, "error", err) - } - } - return connect.NewResponse(&pb.ResumeSandboxResponse{ SandboxId: sb.ID, Status: string(sb.Status), diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index 117d8c7..82dba06 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -626,7 +626,9 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error { // Resume restores a paused sandbox from its snapshot using UFFD for // lazy memory loading. The sandbox gets a new network slot. -func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string) (*models.Sandbox, error) { +// Optional defaultUser and defaultEnv are applied via a single PostInit +// call so that template defaults are set without an extra round-trip. +func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) { pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID) if _, err := os.Stat(pauseDir); err != nil { return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID) @@ -783,8 +785,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, return nil, fmt.Errorf("wait for envd: %w", err) } - // Trigger envd to re-read MMDS so it picks up the new sandbox/template IDs. - if err := client.PostInit(waitCtx); err != nil { + // Trigger envd to re-read MMDS and apply template defaults in a single call. + if err := client.PostInitWithDefaults(waitCtx, defaultUser, defaultEnv); err != nil { slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err) } diff --git a/pkg/lifecycle/hostpool.go b/pkg/lifecycle/hostpool.go index 48ed6c9..508bb52 100644 --- a/pkg/lifecycle/hostpool.go +++ b/pkg/lifecycle/hostpool.go @@ -39,7 +39,19 @@ func NewHostClientPool() *HostClientPool { // (use auth.CPClientTLSConfig to construct it). func NewHostClientPoolTLS(tlsCfg *tls.Config) *HostClientPool { transport := &http.Transport{ - TLSClientConfig: tlsCfg, + TLSClientConfig: tlsCfg, + ForceAttemptHTTP2: false, + // Empty non-nil map disables HTTP/2 ALPN negotiation, forcing HTTP/1.1. + // Connect RPC works over HTTP/1.1; HTTP/2 multiplexing causes HOL + // blocking when a single slow sandbox RPC stalls the shared connection. + TLSNextProto: make(map[string]func(authority string, c *tls.Conn) http.RoundTripper), + MaxIdleConnsPerHost: 20, + IdleConnTimeout: 90 * time.Second, + ResponseHeaderTimeout: 45 * time.Second, + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, } return &HostClientPool{ clients: make(map[string]hostagentv1connect.HostAgentServiceClient),