From 1ca10230a9f1c8641545510d7888cfaeca283d02 Mon Sep 17 00:00:00 2001
From: pptx704
Date: Sun, 29 Mar 2026 02:14:30 +0600
Subject: [PATCH] Prefix network namespaces with wrenn-, add stale cleanup, lower diff cap

Rename ns-{idx} to wrenn-ns-{idx} and veth-{idx} to wrenn-veth-{idx} to
avoid collisions with other tools. Add CleanupStaleNamespaces() at agent
startup to remove orphaned namespaces, veths, iptables rules, and routes
from a previous crash. Lower maxDiffGenerations from 10 to 8 to prevent
Go runtime memory corruption from snapshot/restore drift.
---
 cmd/host-agent/main.go      |  4 +-
 internal/network/setup.go   | 82 ++++++++++++++++++++++++++++++++++++-
 internal/sandbox/manager.go |  8 +++-
 3 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/cmd/host-agent/main.go b/cmd/host-agent/main.go
index 6665ba9..a6571df 100644
--- a/cmd/host-agent/main.go
+++ b/cmd/host-agent/main.go
@@ -16,6 +16,7 @@ import (
 
 	"git.omukk.dev/wrenn/sandbox/internal/devicemapper"
 	"git.omukk.dev/wrenn/sandbox/internal/hostagent"
+	"git.omukk.dev/wrenn/sandbox/internal/network"
 	"git.omukk.dev/wrenn/sandbox/internal/sandbox"
 	"git.omukk.dev/wrenn/sandbox/proto/hostagent/gen/hostagentv1connect"
 )
@@ -42,8 +43,9 @@ func main() {
 		slog.Warn("failed to enable ip_forward", "error", err)
 	}
 
-	// Clean up any stale dm-snapshot devices from a previous crash.
+	// Clean up stale resources from a previous crash.
devicemapper.CleanupStaleDevices() + network.CleanupStaleNamespaces() listenAddr := envOrDefault("WRENN_HOST_LISTEN_ADDR", ":50051") rootDir := envOrDefault("WRENN_DIR", "/var/lib/wrenn") diff --git a/internal/network/setup.go b/internal/network/setup.go index 70a8a54..ee06d39 100644 --- a/internal/network/setup.go +++ b/internal/network/setup.go @@ -5,13 +5,91 @@ import ( "fmt" "log/slog" "net" + "os" "os/exec" "runtime" + "strings" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) +const nsPrefix = "wrenn-ns-" + +// CleanupStaleNamespaces removes leftover wrenn network namespaces from a +// previous crash. Called once at agent startup. +func CleanupStaleNamespaces() { + entries, err := os.ReadDir("/run/netns") + if err != nil { + return // no /run/netns or unreadable — nothing to clean + } + for _, e := range entries { + name := e.Name() + if !strings.HasPrefix(name, nsPrefix) { + continue + } + // Also remove the associated veth from the host side. + vethName := "wrenn-veth-" + strings.TrimPrefix(name, nsPrefix) + if link, err := netlink.LinkByName(vethName); err == nil { + _ = netlink.LinkDel(link) + } + if err := netns.DeleteNamed(name); err != nil { + slog.Warn("failed to remove stale namespace", "ns", name, "error", err) + } else { + slog.Info("removed stale namespace", "ns", name) + } + } + + // Clean up any stale wrenn iptables rules referencing old veth interfaces. + cleanupStaleIptablesRules() +} + +// cleanupStaleIptablesRules removes host iptables rules that reference +// wrenn-veth interfaces no longer present on the system. 
+func cleanupStaleIptablesRules() {
+	for _, table := range []string{"filter", "nat"} {
+		out, err := exec.Command("iptables-save", "-t", table).Output()
+		if err != nil {
+			slog.Debug("failed to dump iptables rules", "table", table, "error", err)
+			continue
+		}
+		for _, line := range strings.Split(string(out), "\n") {
+			// Rules look like "-A FORWARD -i wrenn-veth-1 -o wlo1 -j ACCEPT";
+			// replaying the same spec with -D deletes that exact rule.
+			spec, ok := strings.CutPrefix(line, "-A ")
+			if !ok || !strings.Contains(spec, "wrenn-veth-") {
+				continue
+			}
+			// NOTE(review): Fields does not honour iptables-save quoting, so a
+			// rule with a quoted multi-word argument would fail to delete.
+			args := append([]string{"-t", table, "-D"}, strings.Fields(spec)...)
+			if err := exec.Command("iptables", args...).Run(); err != nil {
+				slog.Debug("failed to remove stale iptables rule", "rule", line, "error", err)
+			}
+		}
+	}
+
+	// Also remove any IPv4 host route still bound to a wrenn-veth link.
+	routes, err := netlink.RouteList(nil, netlink.FAMILY_V4)
+	if err != nil {
+		slog.Debug("failed to list host routes", "error", err)
+		return
+	}
+	for i := range routes {
+		if routes[i].LinkIndex == 0 {
+			continue
+		}
+		link, err := netlink.LinkByIndex(routes[i].LinkIndex)
+		if err != nil || !strings.HasPrefix(link.Attrs().Name, "wrenn-veth-") {
+			continue
+		}
+		// Index into the slice so RouteDel gets a stable pointer, not a loop copy.
+		if err := netlink.RouteDel(&routes[i]); err != nil {
+			slog.Debug("failed to remove stale route", "dst", routes[i].Dst, "error", err)
+		}
+	}
+}
+
 const (
 	// Fixed addresses inside each network namespace (safe because each
 	// sandbox gets its own netns).
@@ -84,8 +162,8 @@ func NewSlot(index int) *Slot {
 		GuestIP:      guestIP,
 		GuestNetMask: guestNetMask,
 		TapName:      tapName,
-		NamespaceID:  fmt.Sprintf("ns-%d", index),
-		VethName:     fmt.Sprintf("veth-%d", index),
+		NamespaceID:  fmt.Sprintf("wrenn-ns-%d", index),
+		VethName:     fmt.Sprintf("wrenn-veth-%d", index),
 	}
 }
 
diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go
index 4e97c49..84e04aa 100644
--- a/internal/sandbox/manager.go
+++ b/internal/sandbox/manager.go
@@ -73,8 +73,12 @@ type snapshotParent struct {
 }
 
 // maxDiffGenerations caps how many incremental diff generations we chain
-// before falling back to a Full snapshot to collapse the chain.
-const maxDiffGenerations = 10
+// before falling back to a Full snapshot to collapse the chain. Firecracker
+// snapshot/restore of a Go process (envd) accumulates runtime memory state
+// drift; empirically, ~10 diff-based cycles corrupt the Go page allocator.
+// A Full snapshot resets the generation counter and produces a clean base,
+// so the cap of 8 leaves a margin below that empirical failure point.
+const maxDiffGenerations = 8
 
 // New creates a new sandbox manager.
 func New(cfg Config) *Manager {