1
0
forked from wrenn/wrenn

Pre-pause snapshot signal to prevent Go runtime crash on restore

envd crashes with "fatal error: bad summary data" after Firecracker
snapshot/restore because the page allocator radix tree is inconsistent
when vCPUs are frozen mid-allocation. The port scanner goroutine
allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before
vm.Pause to quiesce continuous goroutines and force GC. On restore,
PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (use exec.Command instead of exec.CommandContext)
This commit is contained in:
2026-04-13 05:21:10 +06:00
parent 117c46a386
commit 962860ba74
15 changed files with 317 additions and 59 deletions

View File

@@ -190,7 +190,14 @@ func main() {
processLogger := l.With().Str("logger", "process").Logger()
processService := processRpc.Handle(m, &processLogger, defaults, cgroupManager)
service := api.New(&envLogger, defaults, mmdsChan, isNotFC)
// Port scanner and forwarder are managed by PortSubsystem, which
// supports stop/restart across Firecracker snapshot/restore cycles.
portLogger := l.With().Str("logger", "port-forwarder").Logger()
portSubsystem := publicport.NewPortSubsystem(&portLogger, cgroupManager, portScannerInterval)
portSubsystem.Start(ctx)
defer portSubsystem.Stop()
service := api.New(&envLogger, defaults, mmdsChan, isNotFC, ctx, portSubsystem)
handler := api.HandlerFromMux(service, m)
middleware := authn.NewMiddleware(permissions.AuthenticateUsername)
@@ -229,16 +236,6 @@ func main() {
}
}
// Bind all open ports on 127.0.0.1 and localhost to the eth0 interface
portScanner := publicport.NewScanner(portScannerInterval)
defer portScanner.Destroy()
portLogger := l.With().Str("logger", "port-forwarder").Logger()
portForwarder := publicport.NewForwarder(&portLogger, portScanner, cgroupManager)
go portForwarder.StartForwarding(ctx)
go portScanner.ScanAndBroadcast()
err := s.ListenAndServe()
if err != nil {
log.Fatalf("error starting server: %v", err)