1
0
forked from wrenn/wrenn
Files
wrenn-releases/envd/internal/api/store.go
pptx704 962860ba74 Pre-pause snapshot signal to prevent Go runtime crash on restore
envd crashes with "fatal error: bad summary data" after Firecracker
snapshot/restore because the page allocator radix tree is inconsistent
when vCPUs are frozen mid-allocation. The port scanner goroutine
allocates heavily every second, making it the primary trigger.

Add POST /snapshot/prepare to envd — the host agent calls it before
vm.Pause to quiesce continuous goroutines and force GC. On restore,
PostInit restarts the port subsystem via the existing /init endpoint.

- New PortSubsystem abstraction with Start/Stop/Restart lifecycle
- Context-based goroutine cancellation (replaces irreversible channel close)
- Context-aware Signal to prevent scanner/forwarder deadlock
- Fix forwarder goroutine leak (was spinning forever on closed channel)
- Kill socat children on stop to prevent orphans across snapshots
- Fix double cmd.Wait panic (exec.Command instead of CommandContext)
2026-04-13 05:21:10 +06:00

105 lines
2.7 KiB
Go

// SPDX-License-Identifier: Apache-2.0
// Modifications by M/S Omukk
package api
import (
"context"
"encoding/json"
"net/http"
"sync"
"github.com/rs/zerolog"
"git.omukk.dev/wrenn/sandbox/envd/internal/execcontext"
"git.omukk.dev/wrenn/sandbox/envd/internal/host"
publicport "git.omukk.dev/wrenn/sandbox/envd/internal/port"
"git.omukk.dev/wrenn/sandbox/envd/internal/utils"
)
// MMDSClient provides access to MMDS metadata.
type MMDSClient interface {
GetAccessTokenHash(ctx context.Context) (string, error)
}
// DefaultMMDSClient is the production implementation that calls the real MMDS endpoint.
type DefaultMMDSClient struct{}
func (c *DefaultMMDSClient) GetAccessTokenHash(ctx context.Context) (string, error) {
return host.GetAccessTokenHashFromMMDS(ctx)
}
type API struct {
isNotFC bool
logger *zerolog.Logger
accessToken *SecureToken
defaults *execcontext.Defaults
mmdsChan chan *host.MMDSOpts
hyperloopLock sync.Mutex
mmdsClient MMDSClient
lastSetTime *utils.AtomicMax
initLock sync.Mutex
// rootCtx is the parent context from main(), used to restart
// long-lived goroutines after snapshot restore.
rootCtx context.Context
portSubsystem *publicport.PortSubsystem
}
func New(l *zerolog.Logger, defaults *execcontext.Defaults, mmdsChan chan *host.MMDSOpts, isNotFC bool, rootCtx context.Context, portSubsystem *publicport.PortSubsystem) *API {
return &API{
logger: l,
defaults: defaults,
mmdsChan: mmdsChan,
isNotFC: isNotFC,
mmdsClient: &DefaultMMDSClient{},
lastSetTime: utils.NewAtomicMax(),
accessToken: &SecureToken{},
rootCtx: rootCtx,
portSubsystem: portSubsystem,
}
}
func (a *API) GetHealth(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
a.logger.Trace().Msg("Health check")
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "")
w.WriteHeader(http.StatusNoContent)
}
func (a *API) GetMetrics(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
a.logger.Trace().Msg("Get metrics")
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
metrics, err := host.GetMetrics()
if err != nil {
a.logger.Error().Err(err).Msg("Failed to get metrics")
w.WriteHeader(http.StatusInternalServerError)
return
}
w.WriteHeader(http.StatusOK)
if err := json.NewEncoder(w).Encode(metrics); err != nil {
a.logger.Error().Err(err).Msg("Failed to encode metrics")
}
}
func (a *API) getLogger(err error) *zerolog.Event {
if err != nil {
return a.logger.Error().Err(err) //nolint:zerologlint // this is only prep
}
return a.logger.Info() //nolint:zerologlint // this is only prep
}