forked from wrenn/wrenn
refactor: remove Go envd module, update host agent for Rust envd
The Go envd guest agent (`envd/`) is fully replaced by the Rust implementation (`envd-rs/`). This commit removes the Go module and updates all references to it across the codebase.

Makefile: remove ENVD_DIR, VERSION_ENVD, build-envd-go, and dev-envd-go, and drop Go envd from the proto/fmt/vet/tidy/clean targets. Add static-link verification to build-envd.

Host agent: rewrite the snapshot quiesce comments that referenced Go GC and page allocator corruption — they are no longer applicable with Rust envd. Tighten envdclient to expect HTTP 200 (not 204) from the health and file upload endpoints, and require a JSON version response from FetchVersion.

Remove NOTICE (no e2b-derived code remains). Update CLAUDE.md and README.md to reflect the Rust envd architecture.
This commit is contained in:
@@ -250,7 +250,7 @@ func (c *Client) WriteFile(ctx context.Context, path string, content []byte) err
|
||||
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNoContent {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("write file %s: status %d: %s", path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
@@ -292,10 +292,9 @@ func (c *Client) ReadFile(ctx context.Context, path string) ([]byte, error) {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which quiesces
|
||||
// continuous goroutines (port scanner, forwarder) and forces a GC cycle before
|
||||
// Firecracker takes a VM snapshot. This ensures the Go runtime's page allocator
|
||||
// is in a consistent state when vCPUs are frozen.
|
||||
// PrepareSnapshot calls envd's POST /snapshot/prepare endpoint, which stops
|
||||
// the port scanner/forwarder and marks active connections for post-restore
|
||||
// cleanup before Firecracker freezes vCPUs.
|
||||
//
|
||||
// Best-effort: the caller should log a warning on error but not abort the pause.
|
||||
func (c *Client) PrepareSnapshot(ctx context.Context) error {
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"time"
|
||||
@@ -46,20 +45,15 @@ func (c *Client) FetchVersion(ctx context.Context) (string, error) {
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNoContent {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("health check returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil || len(body) == 0 {
|
||||
return "", nil // envd may not support version reporting yet
|
||||
}
|
||||
|
||||
var data struct {
|
||||
Version string `json:"version"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &data); err != nil {
|
||||
return "", nil // non-JSON response, old envd
|
||||
if err := json.NewDecoder(resp.Body).Decode(&data); err != nil {
|
||||
return "", fmt.Errorf("decode version response: %w", err)
|
||||
}
|
||||
|
||||
return data.Version, nil
|
||||
@@ -78,7 +72,7 @@ func (c *Client) healthCheck(ctx context.Context) error {
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNoContent {
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("health check returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
|
||||
@@ -95,11 +95,9 @@ type snapshotParent struct {
|
||||
}
|
||||
|
||||
// maxDiffGenerations caps how many incremental diff generations we chain
|
||||
// before falling back to a Full snapshot to collapse the chain. Firecracker
|
||||
// snapshot/restore of a Go process (envd) accumulates runtime memory state
|
||||
// drift; empirically, ~10 diff-based cycles corrupt the Go page allocator.
|
||||
// A Full snapshot resets the generation counter and produces a clean base,
|
||||
// preventing the crash.
|
||||
// before falling back to a Full snapshot to collapse the chain. Long diff
|
||||
// chains increase restore latency and snapshot directory size; a periodic
|
||||
// Full snapshot resets the counter and produces a clean base.
|
||||
const maxDiffGenerations = 8
|
||||
|
||||
// buildMetadata constructs the metadata map with version information.
|
||||
@@ -382,8 +380,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
m.stopSampler(sb)
|
||||
|
||||
// Step 0: Drain in-flight proxy connections before freezing vCPUs.
|
||||
// This prevents Go runtime corruption inside the guest caused by stale
|
||||
// TCP state from connections that were alive when the VM was snapshotted.
|
||||
// Stale TCP state from mid-flight connections causes issues on restore.
|
||||
sb.connTracker.Drain(2 * time.Second)
|
||||
slog.Debug("pause: proxy connections drained", "id", sandboxID)
|
||||
|
||||
@@ -393,10 +390,8 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
sb.client.CloseIdleConnections()
|
||||
slog.Debug("pause: envd client idle connections closed", "id", sandboxID)
|
||||
|
||||
// Step 0c: Signal envd to quiesce continuous goroutines (port scanner,
|
||||
// forwarder), close idle HTTP connections, and run GC before freezing
|
||||
// vCPUs. This prevents Go runtime page allocator corruption ("bad
|
||||
// summary data") on snapshot restore. The 3s timeout also gives time
|
||||
// Step 0c: Signal envd to quiesce (stop port scanner/forwarder, mark
|
||||
// connections for post-restore cleanup). The 3s timeout also gives time
|
||||
// for the FINs from Step 0b to be processed by the guest kernel.
|
||||
// Best-effort: a failure is logged but does not abort the pause.
|
||||
func() {
|
||||
@@ -405,7 +400,7 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
|
||||
if err := sb.client.PrepareSnapshot(prepCtx); err != nil {
|
||||
slog.Warn("pause: pre-snapshot quiesce failed (best-effort)", "id", sandboxID, "error", err)
|
||||
} else {
|
||||
slog.Debug("pause: envd goroutines quiesced", "id", sandboxID)
|
||||
slog.Debug("pause: envd quiesced", "id", sandboxID)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user