Add auto-pause TTL and ping endpoint for sandbox inactivity management

Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.

Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
  pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory
  and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
  can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
  vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
  incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
This commit is contained in:
2026-03-15 05:15:18 +06:00
parent 88246fac2b
commit 477d4f8cf6
13 changed files with 401 additions and 62 deletions

View File

@ -99,9 +99,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
if req.MemoryMB <= 0 {
req.MemoryMB = 512
}
if req.TimeoutSec <= 0 {
req.TimeoutSec = 300
}
// timeout_sec = 0 means no auto-pause; only set if explicitly requested.
ctx := r.Context()
ac := auth.MustFromContext(ctx)
@ -259,7 +257,8 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
}
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxID,
SandboxId: sandboxID,
TimeoutSec: sb.TimeoutSec,
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)
@ -285,6 +284,44 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, sandboxToResponse(sb))
}
// Ping handles POST /v1/sandboxes/{id}/ping.
// Resets the inactivity timer for a running sandbox.
func (h *sandboxHandler) Ping(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
ac := auth.MustFromContext(ctx)
sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
if err != nil {
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
return
}
if sb.Status != "running" {
writeError(w, http.StatusConflict, "invalid_state", "sandbox is not running")
return
}
if _, err := h.agent.PingSandbox(ctx, connect.NewRequest(&pb.PingSandboxRequest{
SandboxId: sandboxID,
})); err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
}
if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
ID: sandboxID,
LastActiveAt: pgtype.Timestamptz{
Time: time.Now(),
Valid: true,
},
}); err != nil {
slog.Warn("ping: failed to update last_active_at in DB", "sandbox_id", sandboxID, "error", err)
}
w.WriteHeader(http.StatusNoContent)
}
// Destroy handles DELETE /v1/sandboxes/{id}.
func (h *sandboxHandler) Destroy(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")