1
0
forked from wrenn/wrenn

fix: inflate balloon before snapshot to reduce memfile size

Firecracker dumps the entire VM memory region regardless of guest
usage. A 20GB VM using 500MB still produces a ~20GB memfile because
freed pages retain stale data (non-zero blocks).

Inflate the balloon device before snapshot to reclaim free guest
memory. Balloon pages become zero from FC's perspective, allowing
ProcessMemfile to skip them. This reduces memfile size from ~20GB
to ~1-2GB for lightly-used VMs.

- Pause: read guest memory usage, inflate balloon to reclaim free
  pages, wait 2s for guest kernel to process, then proceed
- Resume: deflate balloon to 0 after PostInit so guest gets full
  memory back
- createFromSnapshot: same deflation since template snapshots
  inherit inflated balloon state
- All balloon ops are best-effort with debug logging on failure
This commit is contained in:
2026-05-05 15:38:04 +06:00
parent 51b5d7b3ba
commit 38799770db

View File

@ -409,10 +409,42 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
} }
}() }()
// Step 0d: Inflate balloon to reclaim free guest memory before snapshot.
// Freed pages become zero from FC's perspective, so ProcessMemfile can
// skip them → dramatically smaller memfile (e.g. 20GB → 1GB).
// Best-effort: every failure below is logged at debug level and the pause
// carries on, because the balloon may not be available (e.g.
// snapshot-restored VMs from before balloon was configured).
func() {
	memUsed, err := readEnvdMemUsed(sb.client)
	if err != nil {
		slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err)
		return
	}

	// memUsed is divided down to MiB so it can be compared with sb.MemoryMB.
	usedMiB := int(memUsed / (1024 * 1024))
	// Keep twice the guest's current usage (never less than 256 MiB) plus
	// 128 MiB of headroom for kernel/envd; everything above that is
	// handed to the balloon.
	keepMiB := max(usedMiB*2, 256) + 128
	inflateMiB := sb.MemoryMB - keepMiB
	if inflateMiB <= 0 {
		slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB)
		return
	}

	// Bound the balloon request so an unresponsive VM cannot stall the pause.
	balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second)
	defer balloonCancel()
	if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil {
		slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err)
		return
	}

	// Give guest kernel time to process balloon requests and release pages.
	// Wait on a timer rather than time.Sleep so a cancelled ctx ends the
	// wait immediately instead of blocking the caller for the full 2s.
	settle := time.NewTimer(2 * time.Second)
	defer settle.Stop()
	select {
	case <-settle.C:
	case <-ctx.Done():
		slog.Debug("pause: context done while waiting for balloon to settle", "id", sandboxID, "error", ctx.Err())
		return
	}
	slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB)
}()
pauseStart := time.Now() pauseStart := time.Now()
// Step 1: Pause the VM (freeze vCPUs). // Step 1: Pause the VM (freeze vCPUs).
if err := m.vm.Pause(ctx, sandboxID); err != nil { if err := m.vm.Pause(ctx, sandboxID); err != nil {
// Deflate balloon before returning so sandbox is usable.
_ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
sb.connTracker.Reset() sb.connTracker.Reset()
return fmt.Errorf("pause VM: %w", err) return fmt.Errorf("pause VM: %w", err)
} }
@ -813,6 +845,12 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err) slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
} }
// Deflate balloon — the snapshot was taken with an inflated balloon to
// reduce memfile size, so restore the guest's full memory allocation.
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
}
// Fetch envd version (best-effort). // Fetch envd version (best-effort).
envdVersion, _ := client.FetchVersion(ctx) envdVersion, _ := client.FetchVersion(ctx)
@ -1253,6 +1291,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err) slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err)
} }
// Deflate balloon — template snapshot was taken with an inflated balloon.
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
}
// Fetch envd version (best-effort). // Fetch envd version (best-effort).
envdVersion, _ := client.FetchVersion(ctx) envdVersion, _ := client.FetchVersion(ctx)