Add auto-pause TTL and ping endpoint for sandbox inactivity management

Replace the existing auto-destroy TTL behavior with auto-pause: when a sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses it (snapshot + teardown) instead of destroying it, preserving the ability to resume later.

Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused" vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
2026-03-15 05:15:18 +06:00
parent 88246fac2b
commit 477d4f8cf6
13 changed files with 401 additions and 62 deletions
--- a/internal/api/handlers_sandbox.go
+++ b/internal/api/handlers_sandbox.go
@@ -99,9 +99,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
 	if req.MemoryMB <= 0 {
 		req.MemoryMB = 512
 	}
-	if req.TimeoutSec <= 0 {
-		req.TimeoutSec = 300
-	}
+	// timeout_sec = 0 means no auto-pause; only set if explicitly requested.

 	ctx := r.Context()
 	ac := auth.MustFromContext(ctx)
@@ -259,7 +257,8 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
 	}

 	resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
-		SandboxId: sandboxID,
+		SandboxId:  sandboxID,
+		TimeoutSec: sb.TimeoutSec,
 	}))
 	if err != nil {
 		status, code, msg := agentErrToHTTP(err)
@@ -285,6 +284,44 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
 	writeJSON(w, http.StatusOK, sandboxToResponse(sb))
 }

+// Ping handles POST /v1/sandboxes/{id}/ping: it resets the inactivity timer of a running sandbox via the agent, then the DB.
+// It replies 204 on success, 404 if the team-scoped lookup fails, 409 if the sandbox is not running, or the mapped agent error.
+func (h *sandboxHandler) Ping(w http.ResponseWriter, r *http.Request) {
+	sandboxID := chi.URLParam(r, "id")
+	ctx := r.Context()
+	ac := auth.MustFromContext(ctx)
+
+	sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
+	if err != nil {
+		writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
+		return
+	}
+	if sb.Status != "running" {
+		writeError(w, http.StatusConflict, "invalid_state", "sandbox is not running")
+		return
+	}
+
+	if _, err := h.agent.PingSandbox(ctx, connect.NewRequest(&pb.PingSandboxRequest{
+		SandboxId: sandboxID,
+	})); err != nil {
+		status, code, msg := agentErrToHTTP(err)
+		writeError(w, status, code, msg)
+		return
+	}
+
+	if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{ // best-effort: a failure here is only logged, not returned
+		ID: sandboxID,
+		LastActiveAt: pgtype.Timestamptz{
+			Time:  time.Now(),
+			Valid: true,
+		},
+	}); err != nil {
+		slog.Warn("ping: failed to update last_active_at in DB", "sandbox_id", sandboxID, "error", err)
+	}
+
+	w.WriteHeader(http.StatusNoContent) // success carries no response body
+}
+
 // Destroy handles DELETE /v1/sandboxes/{id}.
 func (h *sandboxHandler) Destroy(w http.ResponseWriter, r *http.Request) {
 	sandboxID := chi.URLParam(r, "id")