1
0
forked from wrenn/wrenn

Pre-pause snapshot signal to prevent Go runtime crash on restore

envd crashes with "fatal error: bad summary data" after Firecracker
snapshot/restore because the page allocator radix tree is inconsistent
when vCPUs are frozen mid-allocation. The port scanner goroutine
allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before
vm.Pause to quiesce continuous goroutines and force GC. On restore,
PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (exec.Command instead of CommandContext)
This commit is contained in:
2026-04-13 05:21:10 +06:00
parent 117c46a386
commit 962860ba74
15 changed files with 317 additions and 59 deletions

View File

@ -1,4 +1,5 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
// portf (port forward) periodaically scans opened TCP ports on the 127.0.0.1 (or localhost)
// and launches `socat` process for every such port in the background.
@ -80,8 +81,16 @@ func (f *Forwarder) StartForwarding(ctx context.Context) {
}
for {
// procs is an array of currently opened ports.
if procs, ok := <-f.scannerSubscriber.Messages; ok {
select {
case <-ctx.Done():
f.stopAllForwarding()
return
case procs, ok := <-f.scannerSubscriber.Messages:
if !ok {
f.stopAllForwarding()
return
}
// Now we are going to refresh all ports that are being forwarded in the `ports` map. Maybe add new ones
// and maybe remove some.
@ -133,11 +142,22 @@ func (f *Forwarder) StartForwarding(ctx context.Context) {
}
}
func (f *Forwarder) startPortForwarding(ctx context.Context, p *PortToForward) {
func (f *Forwarder) stopAllForwarding() {
for _, p := range f.ports {
f.stopPortForwarding(p)
}
f.ports = make(map[string]*PortToForward)
}
func (f *Forwarder) startPortForwarding(_ context.Context, p *PortToForward) {
// https://unix.stackexchange.com/questions/311492/redirect-application-listening-on-localhost-to-listening-on-external-interface
// socat -d -d TCP4-LISTEN:4000,bind=169.254.0.21,fork TCP4:localhost:4000
// reuseaddr is used to fix the "Address already in use" error when restarting socat quickly.
cmd := exec.CommandContext(ctx,
//
// We use exec.Command (not CommandContext) because stopAllForwarding kills
// socat via SIGKILL to the process group. CommandContext would also call
// cmd.Wait() on context cancellation, racing with the wait goroutine below.
cmd := exec.Command(
"socat", "-d", "-d", "-d",
fmt.Sprintf("TCP4-LISTEN:%v,bind=%s,reuseaddr,fork", p.port, f.sourceIP.To4()),
fmt.Sprintf("TCP%d:localhost:%v", p.family, p.port),