1
0
forked from wrenn/wrenn

Pre-pause snapshot signal to prevent Go runtime crash on restore

envd crashes with "fatal error: bad summary data" after Firecracker
snapshot/restore because the page allocator radix tree is inconsistent
when vCPUs are frozen mid-allocation. The port scanner goroutine
allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before
vm.Pause to quiesce continuous goroutines and force GC. On restore,
PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (use exec.Command instead of exec.CommandContext)
This commit is contained in:
2026-04-13 05:21:10 +06:00
parent 117c46a386
commit 962860ba74
15 changed files with 317 additions and 59 deletions

View File

@@ -1,8 +1,10 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
package port
import (
"context"
"sync"
"time"
@@ -10,8 +12,7 @@ import (
)
type Scanner struct {
scanExit chan struct{}
period time.Duration
period time.Duration
// Plain mutex-protected map instead of concurrent-map. The concurrent-map
// library's Items() spawns goroutines and uses a WaitGroup internally,
@@ -20,15 +21,10 @@ type Scanner struct {
subs map[string]*ScannerSubscriber
}
// Destroy closes the scanExit channel, the signal the scan loop selects on
// before returning. Closing is one-shot: a second call would panic on the
// already-closed channel.
// NOTE(review): the hunk line counts suggest this method is on the removed
// side of the diff, superseded by context cancellation — confirm.
func (s *Scanner) Destroy() {
close(s.scanExit)
}
// NewScanner returns a Scanner configured with the given period and an empty
// subscriber map.
// NOTE(review): this rendering interleaves both sides of the diff, so the
// period/subs initializers appear twice; the scanExit line appears to belong
// only to the pre-change version — confirm against the real file.
func NewScanner(period time.Duration) *Scanner {
return &Scanner{
period: period,
subs: make(map[string]*ScannerSubscriber),
scanExit: make(chan struct{}),
period: period,
subs: make(map[string]*ScannerSubscriber),
}
}
@@ -51,7 +47,8 @@ func (s *Scanner) Unsubscribe(sub *ScannerSubscriber) {
}
// ScanAndBroadcast starts scanning open TCP ports and broadcasts every open port to all subscribers.
func (s *Scanner) ScanAndBroadcast() {
// It exits when ctx is cancelled.
func (s *Scanner) ScanAndBroadcast(ctx context.Context) {
for {
// Read directly from /proc/net/tcp and /proc/net/tcp6 instead of
// using gopsutil's net.Connections(), which walks /proc/{pid}/fd
@@ -60,15 +57,14 @@ func (s *Scanner) ScanAndBroadcast() {
s.mu.RLock()
for _, sub := range s.subs {
sub.Signal(conns)
sub.Signal(ctx, conns)
}
s.mu.RUnlock()
select {
case <-s.scanExit:
case <-ctx.Done():
return
default:
time.Sleep(s.period)
case <-time.After(s.period):
}
}
}