Add auto-pause TTL and ping endpoint for sandbox inactivity management

Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.

Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
  pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory
  and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
  can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
  vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
  incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
This commit is contained in:
2026-03-15 05:15:18 +06:00
parent 88246fac2b
commit 477d4f8cf6
13 changed files with 401 additions and 62 deletions

View File

@ -49,7 +49,8 @@ func (rc *Reconciler) Start(ctx context.Context) {
}
func (rc *Reconciler) reconcile(ctx context.Context) {
// Get all sandboxes the host agent knows about.
// Single RPC returns both the running sandbox list and any IDs that
// were auto-paused by the TTL reaper since the last call.
resp, err := rc.agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
if err != nil {
slog.Warn("reconciler: failed to list sandboxes from host agent", "error", err)
@ -62,6 +63,12 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
alive[sb.SandboxId] = struct{}{}
}
// Build auto-paused set from the same response.
autoPausedSet := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
for _, id := range resp.Msg.AutoPausedSandboxIds {
autoPausedSet[id] = struct{}{}
}
// Get all DB sandboxes for this host that are running.
// Paused sandboxes are excluded: they are expected to not exist on the
// host agent because pause = snapshot + destroy resources.
@ -86,12 +93,34 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
return
}
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(stale), "ids", stale)
// Split stale sandboxes into those auto-paused by the TTL reaper vs
// those that crashed/were orphaned.
var toPause, toStop []string
for _, id := range stale {
if _, ok := autoPausedSet[id]; ok {
toPause = append(toPause, id)
} else {
toStop = append(toStop, id)
}
}
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: stale,
Status: "stopped",
}); err != nil {
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
if len(toPause) > 0 {
slog.Info("reconciler: marking auto-paused sandboxes", "count", len(toPause), "ids", toPause)
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: toPause,
Status: "paused",
}); err != nil {
slog.Warn("reconciler: failed to mark auto-paused sandboxes", "error", err)
}
}
if len(toStop) > 0 {
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(toStop), "ids", toStop)
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: toStop,
Status: "stopped",
}); err != nil {
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
}
}
}