Add auto-pause TTL and ping endpoint for sandbox inactivity management
Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.
Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both agent memory
and DB last_active_at
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
This commit is contained in:
@ -99,9 +99,7 @@ func (h *sandboxHandler) Create(w http.ResponseWriter, r *http.Request) {
|
||||
if req.MemoryMB <= 0 {
|
||||
req.MemoryMB = 512
|
||||
}
|
||||
if req.TimeoutSec <= 0 {
|
||||
req.TimeoutSec = 300
|
||||
}
|
||||
// timeout_sec = 0 means no auto-pause; only set if explicitly requested.
|
||||
|
||||
ctx := r.Context()
|
||||
ac := auth.MustFromContext(ctx)
|
||||
@ -259,7 +257,8 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
resp, err := h.agent.ResumeSandbox(ctx, connect.NewRequest(&pb.ResumeSandboxRequest{
|
||||
SandboxId: sandboxID,
|
||||
SandboxId: sandboxID,
|
||||
TimeoutSec: sb.TimeoutSec,
|
||||
}))
|
||||
if err != nil {
|
||||
status, code, msg := agentErrToHTTP(err)
|
||||
@ -285,6 +284,44 @@ func (h *sandboxHandler) Resume(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, http.StatusOK, sandboxToResponse(sb))
|
||||
}
|
||||
|
||||
// Ping handles POST /v1/sandboxes/{id}/ping.
|
||||
// Resets the inactivity timer for a running sandbox.
|
||||
func (h *sandboxHandler) Ping(w http.ResponseWriter, r *http.Request) {
|
||||
sandboxID := chi.URLParam(r, "id")
|
||||
ctx := r.Context()
|
||||
ac := auth.MustFromContext(ctx)
|
||||
|
||||
sb, err := h.db.GetSandboxByTeam(ctx, db.GetSandboxByTeamParams{ID: sandboxID, TeamID: ac.TeamID})
|
||||
if err != nil {
|
||||
writeError(w, http.StatusNotFound, "not_found", "sandbox not found")
|
||||
return
|
||||
}
|
||||
if sb.Status != "running" {
|
||||
writeError(w, http.StatusConflict, "invalid_state", "sandbox is not running")
|
||||
return
|
||||
}
|
||||
|
||||
if _, err := h.agent.PingSandbox(ctx, connect.NewRequest(&pb.PingSandboxRequest{
|
||||
SandboxId: sandboxID,
|
||||
})); err != nil {
|
||||
status, code, msg := agentErrToHTTP(err)
|
||||
writeError(w, status, code, msg)
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.db.UpdateLastActive(ctx, db.UpdateLastActiveParams{
|
||||
ID: sandboxID,
|
||||
LastActiveAt: pgtype.Timestamptz{
|
||||
Time: time.Now(),
|
||||
Valid: true,
|
||||
},
|
||||
}); err != nil {
|
||||
slog.Warn("ping: failed to update last_active_at in DB", "sandbox_id", sandboxID, "error", err)
|
||||
}
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
// Destroy handles DELETE /v1/sandboxes/{id}.
|
||||
func (h *sandboxHandler) Destroy(w http.ResponseWriter, r *http.Request) {
|
||||
sandboxID := chi.URLParam(r, "id")
|
||||
|
||||
@ -175,8 +175,8 @@ const testUIHTML = `<!DOCTYPE html>
|
||||
<input type="number" id="create-vcpus" value="1" min="1" max="8">
|
||||
<label>Memory (MB)</label>
|
||||
<input type="number" id="create-memory" value="512" min="128" max="8192">
|
||||
<label>Timeout (sec)</label>
|
||||
<input type="number" id="create-timeout" value="300" min="30">
|
||||
<label>Timeout (sec, 0 = no auto-pause)</label>
|
||||
<input type="number" id="create-timeout" value="0" min="0">
|
||||
<div class="btn-row">
|
||||
<button class="btn-green" onclick="createSandbox()">Create</button>
|
||||
</div>
|
||||
@ -417,7 +417,7 @@ function renderSandboxes(sandboxes) {
|
||||
document.getElementById('sandboxes-table').innerHTML = '<p style="color:#5f5c57;margin-top:8px">No sandboxes</p>';
|
||||
return;
|
||||
}
|
||||
let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
|
||||
let html = '<table><thead><tr><th>ID</th><th>Status</th><th>Template</th><th>vCPUs</th><th>Mem</th><th>TTL</th><th>Host IP</th><th>Created</th><th>Actions</th></tr></thead><tbody>';
|
||||
for (const sb of sandboxes) {
|
||||
html += '<tr>';
|
||||
html += '<td class="clickable" onclick="useSandbox(\'' + sb.id + '\')">' + sb.id + '</td>';
|
||||
@ -425,10 +425,12 @@ function renderSandboxes(sandboxes) {
|
||||
html += '<td>' + esc(sb.template) + '</td>';
|
||||
html += '<td>' + sb.vcpus + '</td>';
|
||||
html += '<td>' + sb.memory_mb + 'MB</td>';
|
||||
html += '<td>' + (sb.timeout_sec ? sb.timeout_sec + 's' : '-') + '</td>';
|
||||
html += '<td>' + (sb.host_ip || '-') + '</td>';
|
||||
html += '<td>' + new Date(sb.created_at).toLocaleTimeString() + '</td>';
|
||||
html += '<td><div class="btn-row">';
|
||||
if (sb.status === 'running') {
|
||||
html += '<button class="btn-blue" onclick="pingSandbox(\'' + sb.id + '\')">Ping</button>';
|
||||
html += '<button class="btn-amber" onclick="pauseSandbox(\'' + sb.id + '\')">Pause</button>';
|
||||
html += '<button class="btn-red" onclick="destroySandbox(\'' + sb.id + '\')">Destroy</button>';
|
||||
} else if (sb.status === 'paused') {
|
||||
@ -497,6 +499,16 @@ async function destroySandbox(id) {
|
||||
}
|
||||
}
|
||||
|
||||
// Keepalive: POST /v1/sandboxes/{id}/ping resets the sandbox's auto-pause
// inactivity timer. Logs the outcome either way; never throws.
// NOTE: string concatenation (not template literals) is deliberate — this
// presumably lives inside a Go backtick raw-string HTML template, where a
// backtick would terminate the string. Verify before converting.
async function pingSandbox(id) {
  log('Pinging ' + id + '...', 'info');
  try {
    await api('POST', '/v1/sandboxes/' + id + '/ping', null, 'apikey');
    log('Pinged ' + id + ' — inactivity timer reset', 'ok');
  } catch (e) {
    log('Ping failed: ' + e.message, 'err');
  }
}
|
||||
|
||||
// --- Exec ---
|
||||
|
||||
async function execCmd() {
|
||||
|
||||
@ -249,6 +249,40 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/sandboxes/{id}/ping:
|
||||
parameters:
|
||||
- name: id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
|
||||
post:
|
||||
summary: Reset sandbox inactivity timer
|
||||
operationId: pingSandbox
|
||||
tags: [sandboxes]
|
||||
security:
|
||||
- apiKeyAuth: []
|
||||
description: |
|
||||
Resets the last_active_at timestamp for a running sandbox, preventing
|
||||
the auto-pause TTL from expiring. Use this as a keepalive for sandboxes
|
||||
that are idle but should remain running.
|
||||
responses:
|
||||
"204":
|
||||
description: Ping acknowledged, inactivity timer reset
|
||||
"404":
|
||||
description: Sandbox not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"409":
|
||||
description: Sandbox not running
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/sandboxes/{id}/pause:
|
||||
parameters:
|
||||
- name: id
|
||||
@ -721,7 +755,11 @@ components:
|
||||
default: 512
|
||||
timeout_sec:
|
||||
type: integer
|
||||
default: 300
|
||||
default: 0
|
||||
description: >
|
||||
Auto-pause TTL in seconds. The sandbox is automatically paused
|
||||
after this duration of inactivity (no exec or ping). 0 means
|
||||
no auto-pause.
|
||||
|
||||
Sandbox:
|
||||
type: object
|
||||
|
||||
@ -49,7 +49,8 @@ func (rc *Reconciler) Start(ctx context.Context) {
|
||||
}
|
||||
|
||||
func (rc *Reconciler) reconcile(ctx context.Context) {
|
||||
// Get all sandboxes the host agent knows about.
|
||||
// Single RPC returns both the running sandbox list and any IDs that
|
||||
// were auto-paused by the TTL reaper since the last call.
|
||||
resp, err := rc.agent.ListSandboxes(ctx, connect.NewRequest(&pb.ListSandboxesRequest{}))
|
||||
if err != nil {
|
||||
slog.Warn("reconciler: failed to list sandboxes from host agent", "error", err)
|
||||
@ -62,6 +63,12 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
|
||||
alive[sb.SandboxId] = struct{}{}
|
||||
}
|
||||
|
||||
// Build auto-paused set from the same response.
|
||||
autoPausedSet := make(map[string]struct{}, len(resp.Msg.AutoPausedSandboxIds))
|
||||
for _, id := range resp.Msg.AutoPausedSandboxIds {
|
||||
autoPausedSet[id] = struct{}{}
|
||||
}
|
||||
|
||||
// Get all DB sandboxes for this host that are running.
|
||||
// Paused sandboxes are excluded: they are expected to not exist on the
|
||||
// host agent because pause = snapshot + destroy resources.
|
||||
@ -86,12 +93,34 @@ func (rc *Reconciler) reconcile(ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(stale), "ids", stale)
|
||||
// Split stale sandboxes into those auto-paused by the TTL reaper vs
|
||||
// those that crashed/were orphaned.
|
||||
var toPause, toStop []string
|
||||
for _, id := range stale {
|
||||
if _, ok := autoPausedSet[id]; ok {
|
||||
toPause = append(toPause, id)
|
||||
} else {
|
||||
toStop = append(toStop, id)
|
||||
}
|
||||
}
|
||||
|
||||
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: stale,
|
||||
Status: "stopped",
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
|
||||
if len(toPause) > 0 {
|
||||
slog.Info("reconciler: marking auto-paused sandboxes", "count", len(toPause), "ids", toPause)
|
||||
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: toPause,
|
||||
Status: "paused",
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: failed to mark auto-paused sandboxes", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if len(toStop) > 0 {
|
||||
slog.Info("reconciler: marking stale sandboxes as stopped", "count", len(toStop), "ids", toStop)
|
||||
if err := rc.db.BulkUpdateStatusByIDs(ctx, db.BulkUpdateStatusByIDsParams{
|
||||
Column1: toStop,
|
||||
Status: "stopped",
|
||||
}); err != nil {
|
||||
slog.Warn("reconciler: failed to update stale sandboxes", "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -64,6 +64,7 @@ func New(queries *db.Queries, agent hostagentv1connect.HostAgentServiceClient, p
|
||||
r.Delete("/", sandbox.Destroy)
|
||||
r.Post("/exec", exec.Exec)
|
||||
r.Get("/exec/stream", execStream.ExecStream)
|
||||
r.Post("/ping", sandbox.Ping)
|
||||
r.Post("/pause", sandbox.Pause)
|
||||
r.Post("/resume", sandbox.Resume)
|
||||
r.Post("/files/write", files.Upload)
|
||||
|
||||
@ -8,6 +8,7 @@ import (
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"connectrpc.com/connect"
|
||||
@ -71,7 +72,7 @@ func (s *Server) ResumeSandbox(
|
||||
ctx context.Context,
|
||||
req *connect.Request[pb.ResumeSandboxRequest],
|
||||
) (*connect.Response[pb.ResumeSandboxResponse], error) {
|
||||
sb, err := s.mgr.Resume(ctx, req.Msg.SandboxId)
|
||||
sb, err := s.mgr.Resume(ctx, req.Msg.SandboxId, int(req.Msg.TimeoutSec))
|
||||
if err != nil {
|
||||
return nil, connect.NewError(connect.CodeInternal, err)
|
||||
}
|
||||
@ -106,6 +107,19 @@ func (s *Server) DeleteSnapshot(
|
||||
return connect.NewResponse(&pb.DeleteSnapshotResponse{}), nil
|
||||
}
|
||||
|
||||
func (s *Server) PingSandbox(
|
||||
ctx context.Context,
|
||||
req *connect.Request[pb.PingSandboxRequest],
|
||||
) (*connect.Response[pb.PingSandboxResponse], error) {
|
||||
if err := s.mgr.Ping(req.Msg.SandboxId); err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return nil, connect.NewError(connect.CodeNotFound, err)
|
||||
}
|
||||
return nil, connect.NewError(connect.CodeFailedPrecondition, err)
|
||||
}
|
||||
return connect.NewResponse(&pb.PingSandboxResponse{}), nil
|
||||
}
|
||||
|
||||
func (s *Server) Exec(
|
||||
ctx context.Context,
|
||||
req *connect.Request[pb.ExecRequest],
|
||||
@ -394,6 +408,7 @@ func (s *Server) ListSandboxes(
|
||||
}
|
||||
|
||||
return connect.NewResponse(&pb.ListSandboxesResponse{
|
||||
Sandboxes: infos,
|
||||
Sandboxes: infos,
|
||||
AutoPausedSandboxIds: s.mgr.DrainAutoPausedIDs(),
|
||||
}), nil
|
||||
}
|
||||
|
||||
@ -40,6 +40,9 @@ type Manager struct {
|
||||
mu sync.RWMutex
|
||||
boxes map[string]*sandboxState
|
||||
stopCh chan struct{}
|
||||
|
||||
autoPausedMu sync.Mutex
|
||||
autoPausedIDs []string
|
||||
}
|
||||
|
||||
// sandboxState holds the runtime state for a single sandbox.
|
||||
@ -459,7 +462,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
|
||||
// Resume restores a paused sandbox from its snapshot using UFFD for
|
||||
// lazy memory loading. The sandbox gets a new network slot.
|
||||
func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox, error) {
|
||||
func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int) (*models.Sandbox, error) {
|
||||
snapDir := m.cfg.SnapshotsDir
|
||||
if !snapshot.Exists(snapDir, sandboxID) {
|
||||
return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
|
||||
@ -575,7 +578,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
|
||||
SandboxID: sandboxID,
|
||||
KernelPath: m.cfg.KernelPath,
|
||||
RootfsPath: dmDev.DevicePath,
|
||||
VCPUs: 1, // Placeholder; overridden by snapshot.
|
||||
VCPUs: 1, // Placeholder; overridden by snapshot.
|
||||
MemoryMB: int(header.Metadata.Size / (1024 * 1024)), // Placeholder; overridden by snapshot.
|
||||
NetworkNamespace: slot.NamespaceID,
|
||||
TapDevice: slot.TapName,
|
||||
@ -622,7 +625,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
|
||||
Template: "",
|
||||
VCPUs: vmCfg.VCPUs,
|
||||
MemoryMB: vmCfg.MemoryMB,
|
||||
TimeoutSec: 0,
|
||||
TimeoutSec: timeoutSec,
|
||||
SlotIndex: slotIdx,
|
||||
HostIP: slot.HostIP,
|
||||
RootfsPath: dmDev.DevicePath,
|
||||
@ -1033,6 +1036,33 @@ func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
|
||||
return sb.client, nil
|
||||
}
|
||||
|
||||
// Ping resets the inactivity timer for a running sandbox.
|
||||
func (m *Manager) Ping(sandboxID string) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
sb, ok := m.boxes[sandboxID]
|
||||
if !ok {
|
||||
return fmt.Errorf("sandbox not found: %s", sandboxID)
|
||||
}
|
||||
if sb.Status != models.StatusRunning {
|
||||
return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
|
||||
}
|
||||
sb.LastActiveAt = time.Now()
|
||||
return nil
|
||||
}
|
||||
|
||||
// DrainAutoPausedIDs returns and clears the list of sandbox IDs that were
|
||||
// automatically paused by the TTL reaper since the last call.
|
||||
func (m *Manager) DrainAutoPausedIDs() []string {
|
||||
m.autoPausedMu.Lock()
|
||||
defer m.autoPausedMu.Unlock()
|
||||
|
||||
ids := m.autoPausedIDs
|
||||
m.autoPausedIDs = nil
|
||||
return ids
|
||||
}
|
||||
|
||||
func (m *Manager) get(sandboxID string) (*sandboxState, error) {
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
@ -1048,7 +1078,7 @@ func (m *Manager) get(sandboxID string) (*sandboxState, error) {
|
||||
// that have exceeded their TTL (timeout_sec of inactivity).
|
||||
func (m *Manager) StartTTLReaper(ctx context.Context) {
|
||||
go func() {
|
||||
ticker := time.NewTicker(10 * time.Second)
|
||||
ticker := time.NewTicker(2 * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
@ -1064,7 +1094,7 @@ func (m *Manager) StartTTLReaper(ctx context.Context) {
|
||||
}()
|
||||
}
|
||||
|
||||
func (m *Manager) reapExpired(ctx context.Context) {
|
||||
func (m *Manager) reapExpired(_ context.Context) {
|
||||
m.mu.RLock()
|
||||
var expired []string
|
||||
now := time.Now()
|
||||
@ -1072,7 +1102,7 @@ func (m *Manager) reapExpired(ctx context.Context) {
|
||||
if sb.TimeoutSec <= 0 {
|
||||
continue
|
||||
}
|
||||
if sb.Status != models.StatusRunning && sb.Status != models.StatusPaused {
|
||||
if sb.Status != models.StatusRunning {
|
||||
continue
|
||||
}
|
||||
if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
|
||||
@ -1082,10 +1112,23 @@ func (m *Manager) reapExpired(ctx context.Context) {
|
||||
m.mu.RUnlock()
|
||||
|
||||
for _, id := range expired {
|
||||
slog.Info("TTL expired, destroying sandbox", "id", id)
|
||||
if err := m.Destroy(ctx, id); err != nil {
|
||||
slog.Warn("TTL reap failed", "id", id, "error", err)
|
||||
slog.Info("TTL expired, auto-pausing sandbox", "id", id)
|
||||
// Use a detached context so that an app shutdown does not cancel
|
||||
// a pause mid-flight, which would leave the VM frozen without a
|
||||
// valid snapshot.
|
||||
pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
|
||||
err := m.Pause(pauseCtx, id)
|
||||
cancel()
|
||||
if err != nil {
|
||||
slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err)
|
||||
if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil {
|
||||
slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr)
|
||||
}
|
||||
continue
|
||||
}
|
||||
m.autoPausedMu.Lock()
|
||||
m.autoPausedIDs = append(m.autoPausedIDs, id)
|
||||
m.autoPausedMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user