forked from wrenn/wrenn
fix: prevent sandbox halt after resume by fixing HTTP/2 HOL blocking and adding timeouts
Disable HTTP/2 on both host agent server and CP→agent transport — multiplexing caused head-of-line blocking when a slow sandbox RPC stalled the shared connection. Add ResponseHeaderTimeout to envd HTTP clients. Merge SetDefaults into Resume's PostInit call to eliminate an extra round-trip that could hang on a stale connection.
This commit is contained in:
@ -23,8 +23,9 @@ func newHTTPClient() *http.Client {
|
||||
return &http.Client{
|
||||
Timeout: 2 * time.Minute,
|
||||
Transport: &http.Transport{
|
||||
MaxIdleConnsPerHost: 10,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
MaxIdleConnsPerHost: 10,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
ResponseHeaderTimeout: 30 * time.Second,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
@ -38,8 +39,9 @@ func newHTTPClient() *http.Client {
|
||||
func newStreamingHTTPClient() *http.Client {
|
||||
return &http.Client{
|
||||
Transport: &http.Transport{
|
||||
MaxIdleConnsPerHost: 10,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
MaxIdleConnsPerHost: 10,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
ResponseHeaderTimeout: 30 * time.Second,
|
||||
DialContext: (&net.Dialer{
|
||||
Timeout: 10 * time.Second,
|
||||
KeepAlive: 30 * time.Second,
|
||||
|
||||
@ -109,18 +109,11 @@ func (s *Server) ResumeSandbox(
|
||||
req *connect.Request[pb.ResumeSandboxRequest],
|
||||
) (*connect.Response[pb.ResumeSandboxResponse], error) {
|
||||
msg := req.Msg
|
||||
sb, err := s.mgr.Resume(ctx, msg.SandboxId, int(msg.TimeoutSec), msg.KernelVersion)
|
||||
sb, err := s.mgr.Resume(ctx, msg.SandboxId, int(msg.TimeoutSec), msg.KernelVersion, msg.DefaultUser, msg.DefaultEnv)
|
||||
if err != nil {
|
||||
return nil, connect.NewError(connect.CodeInternal, err)
|
||||
}
|
||||
|
||||
// Apply template defaults (user, env vars) if provided.
|
||||
if msg.DefaultUser != "" || len(msg.DefaultEnv) > 0 {
|
||||
if err := s.mgr.SetDefaults(ctx, sb.ID, msg.DefaultUser, msg.DefaultEnv); err != nil {
|
||||
slog.Warn("failed to set sandbox defaults on resume", "sandbox", sb.ID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
return connect.NewResponse(&pb.ResumeSandboxResponse{
|
||||
SandboxId: sb.ID,
|
||||
Status: string(sb.Status),
|
||||
|
||||
@ -626,7 +626,9 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
|
||||
// Resume restores a paused sandbox from its snapshot using UFFD for
|
||||
// lazy memory loading. The sandbox gets a new network slot.
|
||||
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string) (*models.Sandbox, error) {
|
||||
// Optional defaultUser and defaultEnv are applied via a single PostInit
|
||||
// call so that template defaults are set without an extra round-trip.
|
||||
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int, kernelVersion string, defaultUser string, defaultEnv map[string]string) (*models.Sandbox, error) {
|
||||
pauseDir := layout.PauseSnapshotDir(m.cfg.WrennDir, sandboxID)
|
||||
if _, err := os.Stat(pauseDir); err != nil {
|
||||
return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
|
||||
@ -783,8 +785,8 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
|
||||
return nil, fmt.Errorf("wait for envd: %w", err)
|
||||
}
|
||||
|
||||
// Trigger envd to re-read MMDS so it picks up the new sandbox/template IDs.
|
||||
if err := client.PostInit(waitCtx); err != nil {
|
||||
// Trigger envd to re-read MMDS and apply template defaults in a single call.
|
||||
if err := client.PostInitWithDefaults(waitCtx, defaultUser, defaultEnv); err != nil {
|
||||
slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user