Add auto-pause TTL and ping endpoint for sandbox inactivity management

Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.

Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
  pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory
  and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
  can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
  vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
  incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
This commit is contained in:
2026-03-15 05:15:18 +06:00
parent 88246fac2b
commit 477d4f8cf6
13 changed files with 401 additions and 62 deletions

View File

@ -99,9 +99,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
if req.MemoryMB <= 0 {
req.MemoryMB = 512
}
if req.TimeoutSec <= 0 {
req.TimeoutSec = 300
}
// timeout_sec = 0 means no auto-pause; only set if explicitly requested.
ctx := r.Context()
ac := auth.MustFromContext(ctx)
@ -259,7 +257,8 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
}
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
SandboxId: sandboxID,
SandboxId: sandboxID,
TimeoutSec: sb.TimeoutSec,
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)
@ -285,6 +284,44 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, sandboxToResponse(sb))
}
// Ping handles POST /v1/sandboxes/{id}/ping.
// Resets the inactivity timer for a running sandbox.
func (h *sandboxHandler) Ping(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")
ctx := r.Context()
ac := auth.MustFromContext(ctx)
sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
if err != nil {
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
return
}
if sb.Status != "running" {
writeError(w, http.StatusConflict, "invalid_state", "sandbox is not running")
return
}
if _, err := h.agent.PingSandbox(ctx, connect.NewRequest(&pb.PingSandboxRequest{
SandboxId: sandboxID,
})); err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)
return
}
if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
ID: sandboxID,
LastActiveAt: pgtype.Timestamptz{
Time: time.Now(),
Valid: true,
},
}); err != nil {
slog.Warn("ping: failed to update last_active_at in DB", "sandbox_id", sandboxID, "error", err)
}
w.WriteHeader(http.StatusNoContent)
}
// Destroy handles DELETE /v1/sandboxes/{id}.
func (h *sandboxHandler) Destroy(w http.ResponseWriter, r *http.Request) {
sandboxID := chi.URLParam(r, "id")

View File

@ -175,8 +175,8 @@ const testUIHTML = `<!DOCTYPE html>
<input type="number" id="create-vcpus" value="1" min="1" max="8">
<label>Memory (MB)</label>
<input type="number" id="create-memory" value="512" min="128" max="8192">
<label>Timeout (sec)</label>
<input type="number" id="create-timeout" value="300" min="30">
<label>Timeout (sec, 0 = no auto-pause)</label>
<input type="number" id="create-timeout" value="0" min="0">
<div class="btn-row">
<button class="btn-green" onclick="createSandbox()">Create</button>
</div>
@ -417,7 +417,7 @@ function renderSandboxes(sandboxes) {
document.getElementById('sandboxes-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No sandboxes</p>';
return;
}
let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>TTL</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
for (const sb of sandboxes) {
html += '<tr>';
html += '<td class="clickable" onclick="useSandbox(\'' + sb.id + '\')">' + sb.id + '</td>';
@ -425,10 +425,12 @@ function renderSandboxes(sandboxes) {
html += '<td>' + esc(sb.template) + '</td>';
html += '<td>' + sb.vcpus + '</td>';
html += '<td>' + sb.memory_mb + 'MB</td>';
html += '<td>' + (sb.timeout_sec ? sb.timeout_sec + 's' : '-') + '</td>';
html += '<td>' + (sb.host_ip || '-') + '</td>';
html += '<td>' + new Date(sb.created_at).toLocaleTimeString() + '</td>';
html += '<td><div class="btn-row">';
if (sb.status === 'running') {
html += '<button class="btn-blue" onclick="pingSandbox(\'' + sb.id + '\')">Ping</button>';
html += '<button class="btn-amber" onclick="pauseSandbox(\'' + sb.id + '\')">Pause</button>';
html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
} else if (sb.status === 'paused') {
@ -497,6 +499,16 @@ async function destroySandbox(id) {
}
}
async function pingSandbox(id) {
log('Pinging ' + id + '...', 'info');
try {
await api('POST', '/v1/sandboxes/' + id + '/ping', null, 'apikey');
log('Pinged ' + id + ' — inactivity timer reset', 'ok');
} catch (e) {
log('Ping failed: ' + e.message, 'err');
}
}
// --- Exec ---
async function execCmd() {

View File

@ -249,6 +249,40 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/sandboxes/{id}/ping:
parameters:
- name: id
in: path
required: true
schema:
type: string
post:
summary: Reset sandbox inactivity timer
operationId: pingSandbox
tags: [sandboxes]
security:
- apiKeyAuth: []
description: |
Resets the last_active_at timestamp for a running sandbox, preventing
the auto-pause TTL from expiring. Use this as a keepalive for sandboxes
that are idle but should remain running.
responses:
"204":
description: Ping acknowledged, inactivity timer reset
"404":
description: Sandbox not found
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"409":
description: Sandbox not running
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/sandboxes/{id}/pause:
parameters:
- name: id
@ -721,7 +755,11 @@ components:
default: 512
timeout_sec:
type: integer
default: 300
default: 0
description: >
Auto-pause TTL in seconds. The sandbox is automatically paused
after this duration of inactivity (no exec or ping). 0 means
no auto-pause.
Sandbox:
type: object

View File

@ -49,7 +49,8 @@ func (rc *Reconciler) Start(ctx context.Context) {
}
func (rc *Reconciler) reconcile(ctx context.Context) {
// Get all sandboxes the host agent knows about.
// Single RPC returns both the running sandbox list and any IDs that
// were auto-paused by the TTL reaper since the last call.
resp, err := rc.agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
if err != nil {
slog.Warn("reconciler: failed to list sandboxes from host agent", "error", err)
@ -62,6 +63,12 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
alive[sb.SandboxId] = struct{}{}
}
// Build auto-paused set from the same response.
autoPausedSet := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
for _, id := range resp.Msg.AutoPausedSandboxIds {
autoPausedSet[id] = struct{}{}
}
// Get all DB sandboxes for this host that are running.
// Paused sandboxes are excluded: they are expected to not exist on the
// host agent because pause = snapshot + destroy resources.
@ -86,12 +93,34 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
return
}
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(stale), "ids", stale)
// Split stale sandboxes into those auto-paused by the TTL reaper vs
// those that crashed/were orphaned.
var toPause, toStop []string
for _, id := range stale {
if _, ok := autoPausedSet[id]; ok {
toPause = append(toPause, id)
} else {
toStop = append(toStop, id)
}
}
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: stale,
Status: "stopped",
}); err != nil {
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
if len(toPause) > 0 {
slog.Info("reconciler: marking auto-paused sandboxes", "count", len(toPause), "ids", toPause)
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: toPause,
Status: "paused",
}); err != nil {
slog.Warn("reconciler: failed to mark auto-paused sandboxes", "error", err)
}
}
if len(toStop) > 0 {
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(toStop), "ids", toStop)
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
Column1: toStop,
Status: "stopped",
}); err != nil {
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
}
}
}

View File

@ -64,6 +64,7 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient, p
r.Delete("/", sandbox.Destroy)
r.Post("/exec", exec.Exec)
r.Get("/exec/stream", execStream.ExecStream)
r.Post("/ping", sandbox.Ping)
r.Post("/pause", sandbox.Pause)
r.Post("/resume", sandbox.Resume)
r.Post("/files/write", files.Upload)