1
0
forked from wrenn/wrenn

Pre-pause snapshot signal to prevent Go runtime crash on restore

envd crashes with "fatal error: bad summary data" after Firecracker
snapshot/restore because the page allocator radix tree is inconsistent
when vCPUs are frozen mid-allocation. The port scanner goroutine
allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before
vm.Pause to quiesce continuous goroutines and force GC. On restore,
PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (exec.Command instead of CommandContext)
This commit is contained in:
2026-04-13 05:21:10 +06:00
parent 117c46a386
commit 962860ba74
15 changed files with 317 additions and 59 deletions

View File

@ -1,4 +1,5 @@
// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
package api
@ -12,6 +13,7 @@ import (
"git.omukk.dev/wrenn/sandbox/envd/internal/execcontext"
"git.omukk.dev/wrenn/sandbox/envd/internal/host"
publicport "git.omukk.dev/wrenn/sandbox/envd/internal/port"
"git.omukk.dev/wrenn/sandbox/envd/internal/utils"
)
@ -39,17 +41,24 @@ type API struct {
lastSetTime *utils.AtomicMax
initLock sync.Mutex
// rootCtx is the parent context from main(), used to restart
// long-lived goroutines after snapshot restore.
rootCtx context.Context
portSubsystem *publicport.PortSubsystem
}
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool) *API {
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool, rootCtx context.Context, portSubsystem *publicport.PortSubsystem) *API {
return &API{
logger: l,
defaults: defaults,
mmdsChan: mmdsChan,
isNotFC: isNotFC,
mmdsClient: &DefaultMMDSClient{},
lastSetTime: utils.NewAtomicMax(),
accessToken: &SecureToken{},
logger: l,
defaults: defaults,
mmdsChan: mmdsChan,
isNotFC: isNotFC,
mmdsClient: &DefaultMMDSClient{},
lastSetTime: utils.NewAtomicMax(),
accessToken: &SecureToken{},
rootCtx: rootCtx,
portSubsystem: portSubsystem,
}
}