forked from wrenn/wrenn
fix: inflate balloon before snapshot to reduce memfile size
Firecracker dumps the entire VM memory region regardless of guest usage. A 20GB VM using 500MB still produces a ~20GB memfile because freed pages retain stale data (non-zero blocks).

Inflate the balloon device before snapshot to reclaim free guest memory. Balloon pages become zero from FC's perspective, allowing ProcessMemfile to skip them. This reduces memfile size from ~20GB to ~1-2GB for lightly-used VMs.

- Pause: read guest memory usage, inflate balloon to reclaim free pages, wait 2s for the guest kernel to process, then proceed
- Resume: deflate balloon to 0 after PostInit so the guest gets its full memory back
- createFromSnapshot: same deflation, since template snapshots inherit the inflated balloon state
- All balloon ops are best-effort, with debug logging on failure
This commit is contained in:
@ -409,10 +409,42 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Step 0d: Inflate balloon to reclaim free guest memory before snapshot.
|
||||||
|
// Freed pages become zero from FC's perspective, so ProcessMemfile can
|
||||||
|
// skip them → dramatically smaller memfile (e.g. 20GB → 1GB).
|
||||||
|
// Best-effort: balloon may not be available (e.g. snapshot-restored VMs
|
||||||
|
// from before balloon was configured).
|
||||||
|
func() {
|
||||||
|
memUsed, err := readEnvdMemUsed(sb.client)
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("pause: could not read guest memory, skipping balloon inflate", "id", sandboxID, "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
usedMiB := int(memUsed / (1024 * 1024))
|
||||||
|
// Keep 2x used memory (at least 256 MiB) plus 128 MiB headroom for kernel/envd.
|
||||||
|
keepMiB := max(usedMiB*2, 256) + 128
|
||||||
|
inflateMiB := sb.MemoryMB - keepMiB
|
||||||
|
if inflateMiB <= 0 {
|
||||||
|
slog.Debug("pause: not enough free memory for balloon inflate", "id", sandboxID, "used_mib", usedMiB, "total_mib", sb.MemoryMB)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
balloonCtx, balloonCancel := context.WithTimeout(ctx, 10*time.Second)
|
||||||
|
defer balloonCancel()
|
||||||
|
if err := m.vm.UpdateBalloon(balloonCtx, sandboxID, inflateMiB); err != nil {
|
||||||
|
slog.Debug("pause: balloon inflate failed (non-fatal)", "id", sandboxID, "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Give guest kernel time to process balloon requests and release pages.
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
slog.Info("pause: balloon inflated", "id", sandboxID, "inflate_mib", inflateMiB, "guest_used_mib", usedMiB)
|
||||||
|
}()
|
||||||
|
|
||||||
pauseStart := time.Now()
|
pauseStart := time.Now()
|
||||||
|
|
||||||
// Step 1: Pause the VM (freeze vCPUs).
|
// Step 1: Pause the VM (freeze vCPUs).
|
||||||
if err := m.vm.Pause(ctx, sandboxID); err != nil {
|
if err := m.vm.Pause(ctx, sandboxID); err != nil {
|
||||||
|
// Deflate balloon before returning so sandbox is usable.
|
||||||
|
_ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
|
||||||
sb.connTracker.Reset()
|
sb.connTracker.Reset()
|
||||||
return fmt.Errorf("pause VM: %w", err)
|
return fmt.Errorf("pause VM: %w", err)
|
||||||
}
|
}
|
||||||
@ -813,6 +845,12 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
|
|||||||
slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
|
slog.Warn("post-init failed after resume, metadata files may be stale", "sandbox", sandboxID, "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deflate balloon — the snapshot was taken with an inflated balloon to
|
||||||
|
// reduce memfile size, so restore the guest's full memory allocation.
|
||||||
|
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
|
||||||
|
slog.Debug("resume: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Fetch envd version (best-effort).
|
// Fetch envd version (best-effort).
|
||||||
envdVersion, _ := client.FetchVersion(ctx)
|
envdVersion, _ := client.FetchVersion(ctx)
|
||||||
|
|
||||||
@ -1253,6 +1291,11 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
|
|||||||
slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err)
|
slog.Warn("post-init failed after template restore, metadata files may be stale", "sandbox", sandboxID, "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deflate balloon — template snapshot was taken with an inflated balloon.
|
||||||
|
if err := m.vm.UpdateBalloon(ctx, sandboxID, 0); err != nil {
|
||||||
|
slog.Debug("create-from-snapshot: balloon deflate failed (non-fatal)", "id", sandboxID, "error", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Fetch envd version (best-effort).
|
// Fetch envd version (best-effort).
|
||||||
envdVersion, _ := client.FetchVersion(ctx)
|
envdVersion, _ := client.FetchVersion(ctx)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user