forked from wrenn/wrenn
envd crashes with "fatal error: bad summary data" after Firecracker snapshot/restore because the page allocator radix tree is inconsistent when vCPUs are frozen mid-allocation. The port scanner goroutine allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before vm.Pause to quiesce continuous goroutines and force GC. On restore, PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (exec.Command instead of CommandContext)
71 lines
1.6 KiB
Go
71 lines
1.6 KiB
Go
// SPDX-License-Identifier: Apache-2.0
|
|
// Modifications by M/S Omukk
|
|
|
|
package port
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
)
|
|
|
|
type Scanner struct {
|
|
period time.Duration
|
|
|
|
// Plain mutex-protected map instead of concurrent-map. The concurrent-map
|
|
// library's Items() spawns goroutines and uses a WaitGroup internally,
|
|
// which corrupts Go runtime semaphore state across Firecracker snapshot/restore.
|
|
mu sync.RWMutex
|
|
subs map[string]*ScannerSubscriber
|
|
}
|
|
|
|
func NewScanner(period time.Duration) *Scanner {
|
|
return &Scanner{
|
|
period: period,
|
|
subs: make(map[string]*ScannerSubscriber),
|
|
}
|
|
}
|
|
|
|
func (s *Scanner) AddSubscriber(logger *zerolog.Logger, id string, filter *ScannerFilter) *ScannerSubscriber {
|
|
subscriber := NewScannerSubscriber(logger, id, filter)
|
|
|
|
s.mu.Lock()
|
|
s.subs[id] = subscriber
|
|
s.mu.Unlock()
|
|
|
|
return subscriber
|
|
}
|
|
|
|
func (s *Scanner) Unsubscribe(sub *ScannerSubscriber) {
|
|
s.mu.Lock()
|
|
delete(s.subs, sub.ID())
|
|
s.mu.Unlock()
|
|
|
|
sub.Destroy()
|
|
}
|
|
|
|
// ScanAndBroadcast starts scanning open TCP ports and broadcasts every open port to all subscribers.
|
|
// It exits when ctx is cancelled.
|
|
func (s *Scanner) ScanAndBroadcast(ctx context.Context) {
|
|
for {
|
|
// Read directly from /proc/net/tcp and /proc/net/tcp6 instead of
|
|
// using gopsutil's net.Connections(), which walks /proc/{pid}/fd
|
|
// and causes Go runtime corruption after Firecracker snapshot/restore.
|
|
conns, _ := ReadTCPConnections()
|
|
|
|
s.mu.RLock()
|
|
for _, sub := range s.subs {
|
|
sub.Signal(ctx, conns)
|
|
}
|
|
s.mu.RUnlock()
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-time.After(s.period):
|
|
}
|
|
}
|
|
}
|