Add auto-pause TTL and ping endpoint for sandbox inactivity management

Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.

Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
  pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory
  and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
  can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
  vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
  incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
This commit is contained in:
2026-03-15 05:15:18 +06:00
parent 88246fac2b
commit 477d4f8cf6
13 changed files with 401 additions and 62 deletions

View File

@ -8,6 +8,7 @@ import (
"mime/multipart"
"net/http"
"net/url"
"strings"
"time"
"connectrpc.com/connect"
@ -71,7 +72,7 @@ func (s *Server) ResumeSandbox(
ctx context.Context,
req *connect.Request[pb.ResumeSandboxRequest],
) (*connect.Response[pb.ResumeSandboxResponse], error) {
sb, err := s.mgr.Resume(ctx, req.Msg.SandboxId)
sb, err := s.mgr.Resume(ctx, req.Msg.SandboxId, int(req.Msg.TimeoutSec))
if err != nil {
return nil, connect.NewError(connect.CodeInternal, err)
}
@ -106,6 +107,19 @@ func (s *Server) DeleteSnapshot(
return connect.NewResponse(&pb.DeleteSnapshotResponse{}), nil
}
func (s *Server) PingSandbox(
ctx context.Context,
req *connect.Request[pb.PingSandboxRequest],
) (*connect.Response[pb.PingSandboxResponse], error) {
if err := s.mgr.Ping(req.Msg.SandboxId); err != nil {
if strings.Contains(err.Error(), "not found") {
return nil, connect.NewError(connect.CodeNotFound, err)
}
return nil, connect.NewError(connect.CodeFailedPrecondition, err)
}
return connect.NewResponse(&pb.PingSandboxResponse{}), nil
}
func (s *Server) Exec(
ctx context.Context,
req *connect.Request[pb.ExecRequest],
@ -394,6 +408,7 @@ func (s *Server) ListSandboxes(
}
return connect.NewResponse(&pb.ListSandboxesResponse{
Sandboxes: infos,
Sandboxes: infos,
AutoPausedSandboxIds: s.mgr.DrainAutoPausedIDs(),
}), nil
}