forked from wrenn/wrenn
Co-authored-by: Tasnim Kabir Sadik <tksadik@omukk.dev> Reviewed-on: wrenn/wrenn#50
187 lines
5.1 KiB
Go
187 lines
5.1 KiB
Go
// Package sandbox: post-snapshot hole punching for memory-ranges files.
//
// CH v52's SEEK_DATA/SEEK_HOLE snapshot writer only skips ranges that are
// already holes in the source memfd. Pages the guest never reported as free
// are written verbatim — including pages whose contents happen to be all zero
// (fresh allocations the guest scribbled on and then released without telling
// the balloon driver). Walking the resulting file and punching any 4 KiB block
// of zeros recovers that space without any guest cooperation.
|
|
package sandbox
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
const (
	// punchBlockSize is the unit, in bytes, in which the scan tests for
	// all-zero data and in which FALLOC_FL_PUNCH_HOLE requests are sized.
	// It matches the kernel page size and the minimum hole size on ext4.
	punchBlockSize = 4 << 10 // 4 KiB

	// punchReadSize is how many bytes the scan loop requests per read. One
	// read covers 256 blocks that are then split in memory, so a 20 GiB
	// memory-ranges file costs ~20K read(2) syscalls instead of ~5M. This
	// matters on single-disk hosts, where every syscall otherwise contends
	// with sshd / journal IO.
	punchReadSize = 256 * punchBlockSize // 1 MiB
)
|
|
|
|
// punchZeroPagesInDir runs punchZeroPages on every memory* file in dir.
|
|
// CH writes its memory dump as one or more files prefixed "memory" inside
|
|
// the snapshot directory; everything else (config.json, state.json) is
|
|
// metadata and untouched.
|
|
func punchZeroPagesInDir(dir string) {
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
slog.Warn("punch: read snapshot dir", "dir", dir, "error", err)
|
|
return
|
|
}
|
|
for _, e := range entries {
|
|
if e.IsDir() || !strings.HasPrefix(e.Name(), "memory") {
|
|
continue
|
|
}
|
|
path := filepath.Join(dir, e.Name())
|
|
before, after, err := punchZeroPages(path)
|
|
if err != nil {
|
|
slog.Warn("punch: zero-page scan failed", "path", path, "error", err)
|
|
continue
|
|
}
|
|
slog.Info("punch: zero-page scan done",
|
|
"path", path,
|
|
"alloc_before", before,
|
|
"alloc_after", after,
|
|
"reclaimed", before-after)
|
|
}
|
|
}
|
|
|
|
// punchZeroPages scans path block-by-block, batching runs of all-zero 4 KiB
|
|
// blocks and punching them out via FALLOC_FL_PUNCH_HOLE. Existing holes are
|
|
// skipped via SEEK_DATA so a partially-sparse input stays cheap to scan.
|
|
//
|
|
// Returns the file's disk allocation (st_blocks * 512) before and after.
|
|
func punchZeroPages(path string) (int64, int64, error) {
|
|
f, err := os.OpenFile(path, os.O_RDWR, 0)
|
|
if err != nil {
|
|
return 0, 0, err
|
|
}
|
|
defer f.Close()
|
|
|
|
stBefore, err := statBlocks(f)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("stat before: %w", err)
|
|
}
|
|
|
|
fi, err := f.Stat()
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("stat: %w", err)
|
|
}
|
|
size := fi.Size()
|
|
|
|
buf := make([]byte, punchReadSize)
|
|
off := int64(0)
|
|
|
|
for off < size {
|
|
// Skip ahead to next data region; nothing to do in holes.
|
|
next, err := f.Seek(off, 3) // SEEK_DATA = 3
|
|
if err != nil {
|
|
if errors.Is(err, io.EOF) || errors.Is(err, unix.ENXIO) {
|
|
break
|
|
}
|
|
return 0, 0, fmt.Errorf("seek_data @ %d: %w", off, err)
|
|
}
|
|
off = next &^ (punchBlockSize - 1) // align down to block
|
|
|
|
// Find end of this data extent.
|
|
endData, err := f.Seek(off, 4) // SEEK_HOLE = 4
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("seek_hole @ %d: %w", off, err)
|
|
}
|
|
|
|
// Scan [off, endData) chunk by chunk; batch zero runs across both
|
|
// intra-chunk and inter-chunk boundaries so a contiguous zero
|
|
// region is punched in a single fallocate.
|
|
zeroStart := int64(-1)
|
|
cur := off
|
|
for cur < endData {
|
|
toRead := min(int64(len(buf)), endData-cur)
|
|
n, err := readAt(f, buf[:toRead], cur)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("read @ %d: %w", cur, err)
|
|
}
|
|
if n == 0 {
|
|
break
|
|
}
|
|
// Walk the chunk one block at a time, tracking zero runs.
|
|
for blkOff := 0; blkOff < n; blkOff += punchBlockSize {
|
|
blkEnd := min(blkOff+punchBlockSize, n)
|
|
blk := buf[blkOff:blkEnd]
|
|
blkAbs := cur + int64(blkOff)
|
|
if isZero(blk) && len(blk) == punchBlockSize {
|
|
if zeroStart < 0 {
|
|
zeroStart = blkAbs
|
|
}
|
|
} else if zeroStart >= 0 {
|
|
if err := punch(f, zeroStart, blkAbs-zeroStart); err != nil {
|
|
return 0, 0, err
|
|
}
|
|
zeroStart = -1
|
|
}
|
|
}
|
|
cur += int64(n)
|
|
}
|
|
if zeroStart >= 0 {
|
|
if err := punch(f, zeroStart, cur-zeroStart); err != nil {
|
|
return 0, 0, err
|
|
}
|
|
}
|
|
off = endData
|
|
}
|
|
|
|
stAfter, err := statBlocks(f)
|
|
if err != nil {
|
|
return 0, 0, fmt.Errorf("stat after: %w", err)
|
|
}
|
|
return stBefore, stAfter, nil
|
|
}
|
|
|
|
func punch(f *os.File, off, length int64) error {
|
|
mode := uint32(unix.FALLOC_FL_PUNCH_HOLE | unix.FALLOC_FL_KEEP_SIZE)
|
|
if err := unix.Fallocate(int(f.Fd()), mode, off, length); err != nil {
|
|
return fmt.Errorf("fallocate punch @ %d len %d: %w", off, length, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// readAt wraps f.ReadAt and reports io.EOF as success: the byte count is
// returned with a nil error so callers can treat a short or empty read at end
// of file as ordinary data. Any other error is passed through unchanged.
func readAt(f *os.File, buf []byte, off int64) (int, error) {
	switch n, err := f.ReadAt(buf, off); err {
	case io.EOF:
		return n, nil
	default:
		return n, err
	}
}
|
|
|
|
// isZero reports whether every byte of b is 0. An empty or nil slice counts
// as all-zero.
func isZero(b []byte) bool {
	for i := 0; i < len(b); i++ {
		if b[i] != 0 {
			return false
		}
	}
	return true
}
|
|
|
|
func statBlocks(f *os.File) (int64, error) {
|
|
var st unix.Stat_t
|
|
if err := unix.Fstat(int(f.Fd()), &st); err != nil {
|
|
return 0, err
|
|
}
|
|
return int64(st.Blocks) * 512, nil
|
|
}
|