1
0
forked from wrenn/wrenn

feat(vm): replace Firecracker with Cloud Hypervisor

Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH).
CH provides native snapshot/restore via its HTTP API, eliminating the
need for custom UFFD handling, memfile processing, and snapshot header
management that Firecracker required.

Key changes:
- Remove fc.go, jailer.go (FC process management)
- Remove internal/uffd/ package (userfaultfd lazy page loading)
- Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format)
- Add ch.go (CH HTTP API client over Unix socket)
- Add process.go (CH process lifecycle with unshare+netns)
- Add chversion.go (CH version detection)
- Refactor sandbox manager: remove UFFD socket tracking, snapshot
  parent/diff chaining, FC-specific balloon logic; add crash watcher
- Simplify snapshot/local.go to CH's native snapshot format
- Update VM config: FirecrackerBin → VMMBin, new CH-specific fields
- Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
2026-05-17 01:33:12 +06:00
parent c2dc382787
commit eaa6b8576d
25 changed files with 754 additions and 2267 deletions

View File

@ -19,6 +19,12 @@ import (
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold = 90 * time.Second
// transientGracePeriod is how long a sandbox is allowed to stay in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state. This prevents the monitor from racing against in-flight RPCs
// that may not have registered the sandbox on the host agent yet.
//
// checkHost measures the period from the sandbox's LastUpdated timestamp.
const transientGracePeriod = 2 * time.Minute
// HostMonitor runs on a fixed interval and performs two duties:
//
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
}
continue
}
// Sandbox is not alive on host — infer final state.
// Sandbox is not alive on host. If the transition is recent, give the
// in-flight RPC time to finish before declaring a final state.
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
slog.Debug("host monitor: transient sandbox still within grace period",
"sandbox_id", sbIDStr, "status", sb.Status,
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
continue
}
// Grace period expired — infer final state.
var finalStatus string
switch sb.Status {
case "starting", "resuming":

View File

@ -42,6 +42,7 @@ const (
SandboxEventResumed = "sandbox.resumed"
SandboxEventStopped = "sandbox.stopped"
SandboxEventFailed = "sandbox.failed"
SandboxEventError = "sandbox.error"
SandboxEventAutoPaused = "sandbox.auto_paused"
)
@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
c.handlePaused(ctx, sandboxID, event)
case SandboxEventStopped:
c.handleStopped(ctx, sandboxID, event)
case SandboxEventFailed:
case SandboxEventFailed, SandboxEventError:
c.handleFailed(ctx, sandboxID)
case SandboxEventAutoPaused:
c.handleAutoPaused(ctx, sandboxID, event)
@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp
}
// handleStopped records a "stopped" event by moving the sandbox to "stopped"
// through conditional status updates. Two source statuses are tried in order;
// the first update that succeeds ends the handler.
func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) {
	// stopping → stopped: a CP-initiated destroy has completed.
	fromStopping := db.UpdateSandboxStatusIfParams{
		ID:       sandboxID,
		Status:   "stopping",
		Status_2: "stopped",
	}
	if _, err := c.db.UpdateSandboxStatusIf(ctx, fromStopping); err == nil {
		return
	}

	// running → stopped: an autonomous destroy, e.g. TTL auto-destroy.
	fromRunning := db.UpdateSandboxStatusIfParams{
		ID:       sandboxID,
		Status:   "running",
		Status_2: "stopped",
	}
	_, err := c.db.UpdateSandboxStatusIf(ctx, fromRunning)
	if err == nil || errors.Is(err, pgx.ErrNoRows) {
		// Either the update applied or the sandbox was in neither status;
		// neither outcome is worth a warning.
		return
	}
	slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err)
}
// handleFailed is a no-op fallback — the background goroutine already
// performed the conditional DB update before publishing this event.
// We keep the case arm so unknown event types are flagged, but avoid
// an unconditional status write that could clobber concurrent operations.
//
// Both parameters are deliberately ignored (blank identifiers); they exist
// only so the signature matches how the event dispatcher calls this handler.
func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {}
// handleFailed marks a sandbox as "error" when the host agent reports a crash
// or the CP's background goroutine publishes a failure. It uses conditional
// updates keyed on the expected current status to avoid clobbering concurrent
// operations.
//
// Transitions are attempted in order: running → error, then starting → error.
// No other transition is attempted. As in handleStopped, pgx.ErrNoRows from
// the last attempt is treated as "status did not match" and is not reported;
// any other error is logged instead of being silently dropped.
func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
	// Try running → error (VM crash pushed by host agent).
	if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
		ID: sandboxID, Status: "running", Status_2: "error",
	}); err == nil {
		return
	}

	// Try starting → error (create failed).
	if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
		ID: sandboxID, Status: "starting", Status_2: "error",
	}); err != nil && !errors.Is(err, pgx.ErrNoRows) {
		// A real database failure here would otherwise leave the sandbox in a
		// stale status with no trace of why.
		slog.Warn("sandbox event consumer: failed to update sandbox to error", "sandbox_id", sandboxID, "error", err)
	}
}
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {
sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{