package sandbox

import (
	"errors"
	"fmt"
	"io/fs"
	"log/slog"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"github.com/google/uuid"

	"git.omukk.dev/wrenn/wrenn/internal/layout"
	"git.omukk.dev/wrenn/wrenn/internal/models"
)

// RestorePausedSandboxes scans WRENN_DIR/sandboxes/ for paused-sandbox
// snapshots left behind by a previous agent instance and re-registers them
// in m.boxes as StatusPaused. Without this, ListSandboxes would not report
// these sandboxes, and the CP's HostMonitor would mark them stopped via
// the missing-confirmed-dead reconcile path — orphaning the on-disk
// snapshot dir and surfacing a leaked "stopped" sandbox to users.
//
// Restored sandboxes hold ONLY the slot reservation; VM / network / dm /
// loop refcount stay unowned until Resume rebuilds them. baseImagePath is
// deliberately NOT set on the in-memory entry so cleanup() does not call
// loops.Release on a loop that was never Acquire'd — the registry tolerates
// a Release of an unknown key, but a coincident-same-base running sandbox
// would have its refcount decremented incorrectly.
//
// Must be called once at agent startup, AFTER CleanupOrphanPauseDirs (so
// .staging-* / .trash-* dirs are gone) and BEFORE the HTTP server starts
// serving — otherwise an early Create RPC can race the slot reservation.
//
// Corrupt snapshot dirs (unparseable meta, missing slot index, bad team or
// template UUID) are renamed to {name}.trash-{ts} so a future
// CleanupOrphanPauseDirs sweeps them. Soft errors are logged; this function
// never returns an error — startup should not fail because a single sandbox
// is unrecoverable. A missing sandboxes directory is treated as a fresh
// install and is silent; any other failure to list it is logged and the
// restore is skipped.
//
// Slots are processed in ascending index order and ties on CreatedAt are
// broken by sandbox ID, so repeated startups over the same on-disk state
// pick the same winners and log in the same order.
func (m *Manager) RestorePausedSandboxes() {
	sandboxesDir := layout.SandboxesDir(m.cfg.WrennDir)
	entries, err := os.ReadDir(sandboxesDir)
	if err != nil {
		// Directory does not exist yet — fresh install, nothing to restore.
		// Anything else (permissions, I/O error) means paused sandboxes may
		// exist but cannot be enumerated: say so instead of failing silently.
		if !errors.Is(err, fs.ErrNotExist) {
			slog.Warn("restore: cannot read sandboxes dir, skipping restore",
				"dir", sandboxesDir, "error", err)
		}
		return
	}

	type candidate struct {
		sandboxID string
		snapDir   string
		meta      *snapshotMeta
		teamID    [16]byte
		templID   [16]byte
	}

	// Pass 1: parse every snapshot meta. Trash anything unreadable or
	// missing the slot index — those are crash artefacts, not recoverable
	// sandboxes.
	candidates := make([]candidate, 0, len(entries))
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		name := e.Name()

		// Skip CleanupOrphanPauseDirs's territory. If it ran before us
		// these are already gone; if not, leave them alone.
		if strings.Contains(name, ".staging-") || strings.Contains(name, ".trash-") {
			continue
		}

		snapDir := layout.PauseSnapshotDir(m.cfg.WrennDir, name)
		meta, err := readSnapshotMeta(snapDir)
		if err != nil {
			slog.Warn("restore: unreadable snapshot meta, trashing dir", "id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}
		if meta.SlotIndex == 0 {
			slog.Warn("restore: snapshot has no slot_index, trashing dir", "id", name)
			trashCorruptDir(snapDir)
			continue
		}

		teamBytes, err := parsePlainUUID(meta.TeamID)
		if err != nil {
			slog.Warn("restore: bad team_id in snapshot meta", "id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}
		templateBytes, err := parsePlainUUID(meta.TemplateID)
		if err != nil {
			slog.Warn("restore: bad template_id in snapshot meta", "id", name, "error", err)
			trashCorruptDir(snapDir)
			continue
		}

		candidates = append(candidates, candidate{
			sandboxID: name,
			snapDir:   snapDir,
			meta:      meta,
			teamID:    teamBytes,
			templID:   templateBytes,
		})
	}

	// Pass 2: bucket by slot index, pick the newest CreatedAt per slot.
	// Multiple candidates per slot happen when older paused-sandbox dirs
	// were left on disk by the pre-fix leak (DB row marked stopped but the
	// snapshot was never cleaned). The newest is the most likely live one;
	// older losers are trashed so CleanupOrphanPauseDirs sweeps them on
	// the next startup.
	bySlot := make(map[int][]candidate, len(candidates))
	for _, c := range candidates {
		bySlot[c.meta.SlotIndex] = append(bySlot[c.meta.SlotIndex], c)
	}

	// Map iteration order is randomized; walk slots in ascending order so
	// reservation order and log output are reproducible.
	slots := make([]int, 0, len(bySlot))
	for slot := range bySlot {
		slots = append(slots, slot)
	}
	sort.Ints(slots)

	restored := 0
	pruned := 0
	for _, slot := range slots {
		cands := bySlot[slot]

		// Newest first. Equal timestamps fall back to the sandbox ID so the
		// winner does not depend on the (unstable) sort's internal order.
		sort.Slice(cands, func(i, j int) bool {
			ci, cj := cands[i], cands[j]
			if ci.meta.CreatedAt.After(cj.meta.CreatedAt) {
				return true
			}
			if cj.meta.CreatedAt.After(ci.meta.CreatedAt) {
				return false
			}
			return ci.sandboxID < cj.sandboxID
		})
		winner := cands[0]

		// Trash every loser. The host_monitor's zombie-cleanup path catches
		// the winner if its DB row says 'stopped' — but losers never enter
		// m.boxes and would otherwise sit on disk indefinitely.
		for _, stale := range cands[1:] {
			slog.Info("restore: pruning older snapshot for same slot",
				"id", stale.sandboxID, "slot", slot,
				"created", stale.meta.CreatedAt,
				"winner", winner.sandboxID,
				"winner_created", winner.meta.CreatedAt)
			trashCorruptDir(stale.snapDir)
			pruned++
		}

		if err := m.slots.Reserve(winner.meta.SlotIndex); err != nil {
			// NOTE(review): candidates are already unique per slot index
			// here, so a failure presumably means the allocator rejected
			// the index (already taken, out of range, or corrupt state) —
			// confirm against the slot allocator. Either way the snapshot
			// is unusable without a slot, so trash it.
			slog.Warn("restore: slot reservation failed, trashing dir",
				"id", winner.sandboxID, "slot", winner.meta.SlotIndex, "error", err)
			trashCorruptDir(winner.snapDir)
			pruned++
			continue
		}

		sb := &sandboxState{
			Sandbox: models.Sandbox{
				ID:             winner.sandboxID,
				Status:         models.StatusPaused,
				TemplateTeamID: winner.teamID,
				TemplateID:     winner.templID,
				VCPUs:          winner.meta.VCPUs,
				MemoryMB:       winner.meta.MemoryMB,
				TimeoutSec:     winner.meta.TimeoutSec,
				SlotIndex:      winner.meta.SlotIndex,
				CreatedAt:      winner.meta.CreatedAt,
				// LastActiveAt cosmetic only — TTL reaper ignores non-Running.
				LastActiveAt: winner.meta.CreatedAt,
			},
			// connTracker must be non-nil: resumeFromMeta calls Reset() on it
			// unconditionally during rehydration. A nil pointer would panic.
			connTracker: &ConnTracker{},
			// baseImagePath intentionally left empty — see function doc.
			// sandboxDirOverride intentionally left empty — resumeFromMeta
			// reads meta.SandboxDir from disk on the resume path.
		}

		m.mu.Lock()
		m.boxes[winner.sandboxID] = sb
		m.mu.Unlock()
		restored++

		slog.Info("restored paused sandbox",
			"id", winner.sandboxID,
			"slot", winner.meta.SlotIndex,
			"vcpus", winner.meta.VCPUs,
			"memory_mb", winner.meta.MemoryMB)
	}

	if restored > 0 || pruned > 0 {
		slog.Info("paused sandbox restore complete", "restored", restored, "pruned", pruned)
	}
}

// parsePlainUUID turns a standard hyphenated UUID string (as produced by
// id.UUIDString) back into the 16-byte representation used by sandboxState.
// An empty string is rejected explicitly; any other input is handed to
// uuid.Parse and its error is returned unchanged. On error the returned
// array is all zeroes.
func parsePlainUUID(s string) ([16]byte, error) {
	if s == "" {
		return [16]byte{}, fmt.Errorf("empty uuid string")
	}
	u, err := uuid.Parse(s)
	if err != nil {
		return [16]byte{}, err
	}
	return [16]byte(u), nil
}

// trashCorruptDir renames a snapshot directory to
// {base}.trash-{unix-nanos} in the same parent directory so a future
// CleanupOrphanPauseDirs sweeps it. Best-effort: if rename fails we log
// and move on — leaving the directory in place is safe (restore will skip
// or re-trash it next startup) but unwanted.
func trashCorruptDir(dir string) {
	parent := filepath.Dir(dir)
	base := filepath.Base(dir)
	trash := filepath.Join(parent, fmt.Sprintf("%s.trash-%d", base, time.Now().UnixNano()))
	if err := os.Rename(dir, trash); err != nil {
		slog.Warn("restore: failed to trash corrupt snapshot dir",
			"src", dir, "dst", trash, "error", err)
	}
}