Add auto-pause TTL and ping endpoint for sandbox inactivity management
Replace the existing auto-destroy TTL behavior with auto-pause: when a
sandbox exceeds its timeout_sec of inactivity, the TTL reaper now pauses
it (snapshot + teardown) instead of destroying it, preserving the ability
to resume later.
Key changes:
- TTL reaper calls Pause instead of Destroy, with fallback to Destroy if
pause fails (e.g. Firecracker process already gone)
- New PingSandbox RPC resets the in-memory LastActiveAt timer
- New POST /v1/sandboxes/{id}/ping REST endpoint resets both the agent's
  in-memory timer and the DB's last_active_at (a client-side keepalive
  sketch follows this list)
- ListSandboxes RPC now includes auto_paused_sandbox_ids so the reconciler
can distinguish auto-paused sandboxes from crashed ones in a single call
- Reconciler polls every 5s (was 30s) and marks auto-paused as "paused"
vs orphaned as "stopped"
- Resume RPC accepts timeout_sec from DB so TTL survives pause/resume cycles
- Reaper checks every 2s (was 10s) and uses a detached context to avoid
incomplete pauses on app shutdown
- Default timeout_sec changed from 300 to 0 (no auto-pause unless requested)
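For callers, keeping a sandbox alive now means pinging it more often than its timeout_sec. Below is a minimal client-side keepalive sketch against the new REST endpoint; only the `/v1/sandboxes/{id}/ping` path comes from this commit, while the package name, helper name, interval choice, and error handling are illustrative:

```go
package client

import (
	"fmt"
	"log"
	"net/http"
	"time"
)

// KeepAlive POSTs to /v1/sandboxes/{id}/ping at the given interval so the
// TTL reaper never observes timeout_sec of inactivity. Close done to stop.
// Pick an interval well below the sandbox's timeout_sec.
func KeepAlive(baseURL, sandboxID string, interval time.Duration, done <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	url := fmt.Sprintf("%s/v1/sandboxes/%s/ping", baseURL, sandboxID)
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
			resp, err := http.Post(url, "application/json", nil)
			if err != nil {
				log.Printf("ping %s failed: %v", sandboxID, err)
				continue
			}
			resp.Body.Close()
		}
	}
}
```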
```diff
@@ -40,6 +40,9 @@ type Manager struct {
 	mu     sync.RWMutex
 	boxes  map[string]*sandboxState
 	stopCh chan struct{}
+
+	autoPausedMu  sync.Mutex
+	autoPausedIDs []string
 }
 
 // sandboxState holds the runtime state for a single sandbox.
```
```diff
@@ -459,7 +462,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
 
 // Resume restores a paused sandbox from its snapshot using UFFD for
 // lazy memory loading. The sandbox gets a new network slot.
-func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox, error) {
+func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int) (*models.Sandbox, error) {
 	snapDir := m.cfg.SnapshotsDir
 	if !snapshot.Exists(snapDir, sandboxID) {
 		return nil, fmt.Errorf("no snapshot found for sandbox %s", sandboxID)
@@ -575,7 +578,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 		SandboxID:        sandboxID,
 		KernelPath:       m.cfg.KernelPath,
 		RootfsPath:       dmDev.DevicePath,
-		VCPUs:            1, // Placeholder; overridden by snapshot.
+		VCPUs:            1, // Placeholder; overridden by snapshot.
 		MemoryMB:         int(header.Metadata.Size / (1024 * 1024)), // Placeholder; overridden by snapshot.
 		NetworkNamespace: slot.NamespaceID,
 		TapDevice:        slot.TapName,
@@ -622,7 +625,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string) (*models.Sandbox
 		Template:   "",
 		VCPUs:      vmCfg.VCPUs,
 		MemoryMB:   vmCfg.MemoryMB,
-		TimeoutSec: 0,
+		TimeoutSec: timeoutSec,
 		SlotIndex:  slotIdx,
 		HostIP:     slot.HostIP,
 		RootfsPath: dmDev.DevicePath,
```
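Since Resume now takes timeoutSec, the caller (per the commit message, the Resume RPC) is expected to read the persisted value back out of the DB so a pause/resume cycle does not silently drop the TTL. A sketch of that call path, written as if it lived alongside Manager; the SandboxStore interface and resumeWithTTL helper are hypothetical stand-ins for the real DB layer and RPC handler:

```go
// SandboxStore is a hypothetical stand-in for the DB layer; the real RPC
// handler presumably reads timeout_sec from the sandboxes table.
type SandboxStore interface {
	GetTimeoutSec(ctx context.Context, sandboxID string) (int, error)
}

// resumeWithTTL restores a paused sandbox and re-arms its inactivity TTL
// with the persisted value, so the TTL survives pause/resume cycles.
func resumeWithTTL(ctx context.Context, m *Manager, store SandboxStore, sandboxID string) (*models.Sandbox, error) {
	timeoutSec, err := store.GetTimeoutSec(ctx, sandboxID)
	if err != nil {
		return nil, fmt.Errorf("load timeout_sec: %w", err)
	}
	return m.Resume(ctx, sandboxID, timeoutSec)
}
```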
```diff
@@ -1033,6 +1036,33 @@ func (m *Manager) GetClient(sandboxID string) (*envdclient.Client, error) {
 	return sb.client, nil
 }
 
+// Ping resets the inactivity timer for a running sandbox.
+func (m *Manager) Ping(sandboxID string) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	sb, ok := m.boxes[sandboxID]
+	if !ok {
+		return fmt.Errorf("sandbox not found: %s", sandboxID)
+	}
+	if sb.Status != models.StatusRunning {
+		return fmt.Errorf("sandbox %s is not running (status: %s)", sandboxID, sb.Status)
+	}
+	sb.LastActiveAt = time.Now()
+	return nil
+}
+
+// DrainAutoPausedIDs returns and clears the list of sandbox IDs that were
+// automatically paused by the TTL reaper since the last call.
+func (m *Manager) DrainAutoPausedIDs() []string {
+	m.autoPausedMu.Lock()
+	defer m.autoPausedMu.Unlock()
+
+	ids := m.autoPausedIDs
+	m.autoPausedIDs = nil
+	return ids
+}
+
 func (m *Manager) get(sandboxID string) (*sandboxState, error) {
 	m.mu.RLock()
 	defer m.mu.RUnlock()
```
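DrainAutoPausedIDs is read-and-clear, so exactly one consumer should own it; per the commit message that consumer is the ListSandboxes RPC, which folds the drained IDs into its response as auto_paused_sandbox_ids. A sketch of what that handler plausibly looks like; the Server and pb types, the manager's List method, and the toProto helper are assumptions, not this diff's API:

```go
// ListSandboxes reports current sandboxes plus every ID the TTL reaper has
// auto-paused since the previous call, letting the reconciler distinguish
// auto-paused sandboxes from crashed ones in a single round trip.
func (s *Server) ListSandboxes(ctx context.Context, req *pb.ListSandboxesRequest) (*pb.ListSandboxesResponse, error) {
	resp := &pb.ListSandboxesResponse{}
	for _, sb := range s.manager.List() {
		resp.Sandboxes = append(resp.Sandboxes, toProto(sb))
	}
	// Drain-once semantics: each auto-paused ID is reported exactly once.
	resp.AutoPausedSandboxIds = s.manager.DrainAutoPausedIDs()
	return resp, nil
}
```

Drain-once reporting does mean a lost response drops the hint for those IDs; the faster 5s reconcile cadence presumably keeps that window small.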
```diff
@@ -1048,7 +1078,7 @@ func (m *Manager) get(sandboxID string) (*sandboxState, error) {
 // that have exceeded their TTL (timeout_sec of inactivity).
 func (m *Manager) StartTTLReaper(ctx context.Context) {
 	go func() {
-		ticker := time.NewTicker(10 * time.Second)
+		ticker := time.NewTicker(2 * time.Second)
 		defer ticker.Stop()
 
 		for {
@@ -1064,7 +1094,7 @@ func (m *Manager) StartTTLReaper(ctx context.Context) {
 	}()
 }
 
-func (m *Manager) reapExpired(ctx context.Context) {
+func (m *Manager) reapExpired(_ context.Context) {
 	m.mu.RLock()
 	var expired []string
 	now := time.Now()
@@ -1072,7 +1102,7 @@ func (m *Manager) reapExpired(ctx context.Context) {
 		if sb.TimeoutSec <= 0 {
 			continue
 		}
-		if sb.Status != models.StatusRunning && sb.Status != models.StatusPaused {
+		if sb.Status != models.StatusRunning {
 			continue
 		}
 		if now.Sub(sb.LastActiveAt) > time.Duration(sb.TimeoutSec)*time.Second {
@@ -1082,10 +1112,23 @@ func (m *Manager) reapExpired(ctx context.Context) {
 	m.mu.RUnlock()
 
 	for _, id := range expired {
-		slog.Info("TTL expired, destroying sandbox", "id", id)
-		if err := m.Destroy(ctx, id); err != nil {
-			slog.Warn("TTL reap failed", "id", id, "error", err)
+		slog.Info("TTL expired, auto-pausing sandbox", "id", id)
+		// Use a detached context so that an app shutdown does not cancel
+		// a pause mid-flight, which would leave the VM frozen without a
+		// valid snapshot.
+		pauseCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		err := m.Pause(pauseCtx, id)
+		cancel()
+		if err != nil {
+			slog.Warn("TTL auto-pause failed, destroying sandbox", "id", id, "error", err)
+			if destroyErr := m.Destroy(context.Background(), id); destroyErr != nil {
+				slog.Warn("TTL destroy after failed pause also failed", "id", id, "error", destroyErr)
+			}
+			continue
 		}
+		m.autoPausedMu.Lock()
+		m.autoPausedIDs = append(m.autoPausedIDs, id)
+		m.autoPausedMu.Unlock()
 	}
 }
```
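The reconciler itself is outside this diff, but the classification it performs on each 5s poll is easy to state as a pure function: anything the DB believes is running but the agent no longer reports was either auto-paused (the reaper snapshotted it) or orphaned (it crashed). A sketch using the "paused"/"stopped" status strings from the commit message; the function name and input shapes are illustrative:

```go
// classifyMissing returns the new DB status for each sandbox the DB
// believes is running. IDs in autoPaused were snapshotted by the TTL
// reaper and become "paused"; other missing IDs are treated as orphaned
// and become "stopped". Inputs come from one ListSandboxes call plus the DB.
func classifyMissing(dbRunning, agentRunning, autoPaused []string) map[string]string {
	alive := make(map[string]bool, len(agentRunning))
	for _, id := range agentRunning {
		alive[id] = true
	}
	paused := make(map[string]bool, len(autoPaused))
	for _, id := range autoPaused {
		paused[id] = true
	}
	statuses := make(map[string]string, len(dbRunning))
	for _, id := range dbRunning {
		switch {
		case alive[id]:
			// Still running on the agent; no change.
		case paused[id]:
			statuses[id] = "paused"
		default:
			statuses[id] = "stopped"
		}
	}
	return statuses
}
```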