feat(vm): replace Firecracker with Cloud Hypervisor

Migrate the entire VM layer from Firecracker (FC) to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for the custom UFFD handling, memfile processing, and snapshot header management that Firecracker required.

Key changes:
- Remove fc.go, jailer.go (FC process management)
- Remove internal/uffd/ package (userfaultfd lazy page loading)
- Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format)
- Add ch.go (CH HTTP API client over Unix socket)
- Add process.go (CH process lifecycle with unshare+netns)
- Add chversion.go (CH version detection)
- Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, and FC-specific balloon logic; add crash watcher
- Simplify snapshot/local.go to CH's native snapshot format
- Update VM config: FirecrackerBin → VMMBin, new CH-specific fields
- Update envdclient, devicemapper, network for CH compatibility
2026-05-17 01:33:12 +06:00
parent c2dc382787
commit eaa6b8576d
25 changed files with 754 additions and 2267 deletions
--- a/internal/api/host_monitor.go
+++ b/internal/api/host_monitor.go
@ -19,6 +19,12 @@ import (
 // it is considered unreachable (3 missed 30-second heartbeats).
 const unreachableThreshold = 90 * time.Second

+// transientGracePeriod is how long a sandbox is allowed to stay in a transient
+// status (starting, resuming, pausing, stopping) before the monitor infers a
+// final state. This prevents the monitor from racing against in-flight RPCs
+// that may not have registered the sandbox on the host agent yet.
+//
+// checkHost measures the elapsed time from the sandbox's LastUpdated timestamp;
+// a sandbox whose LastUpdated is not valid gets no grace period.
+const transientGracePeriod = 2 * time.Minute
+
 // HostMonitor runs on a fixed interval and performs two duties:
 //
 //  1. Passive check: marks hosts whose last_heartbeat_at is stale as
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
 			}
 			continue
 		}
-		// Sandbox is not alive on host — infer final state.
+		// Sandbox is not alive on host. If the transition is recent, give the
+		// in-flight RPC time to finish before declaring a final state.
+		if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
+			slog.Debug("host monitor: transient sandbox still within grace period",
+				"sandbox_id", sbIDStr, "status", sb.Status,
+				"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
+			continue
+		}
+
+		// Grace period expired — infer final state.
 		var finalStatus string
 		switch sb.Status {
 		case "starting", "resuming":
--- a/internal/api/sandbox_event_consumer.go
+++ b/internal/api/sandbox_event_consumer.go
@ -42,6 +42,7 @@ const (
 	SandboxEventResumed    = "sandbox.resumed"
 	SandboxEventStopped    = "sandbox.stopped"
 	SandboxEventFailed     = "sandbox.failed"
+	SandboxEventError      = "sandbox.error"
 	SandboxEventAutoPaused = "sandbox.auto_paused"
 )

@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
 		c.handlePaused(ctx, sandboxID, event)
 	case SandboxEventStopped:
 		c.handleStopped(ctx, sandboxID, event)
-	case SandboxEventFailed:
+	case SandboxEventFailed, SandboxEventError:
 		c.handleFailed(ctx, sandboxID)
 	case SandboxEventAutoPaused:
 		c.handleAutoPaused(ctx, sandboxID, event)
@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp
 }

+// handleStopped handles a sandbox.stopped event by moving the sandbox row to
+// "stopped" using conditional status updates: first from "stopping" and, if
+// that call returns any error, from "running".
+//
+// NOTE(review): an error from the first attempt that is not pgx.ErrNoRows is
+// never logged; only the second attempt's error is. Confirm this is intended.
 func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) {
+	// Try stopping → stopped (CP-initiated destroy completed).
 	if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
 		ID:       sandboxID,
 		Status:   "stopping",
 		Status_2: "stopped",
+	}); err == nil {
+		return
+	}
+	// Try running → stopped (autonomous destroy, e.g. TTL auto-destroy).
+	// Reached whenever the first update returned an error of any kind.
+	if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
+		ID:       sandboxID,
+		Status:   "running",
+		Status_2: "stopped",
 	}); err != nil && !errors.Is(err, pgx.ErrNoRows) {
 		slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err)
 	}
 }

-// handleFailed is a no-op fallback — the background goroutine already
-// performed the conditional DB update before publishing this event.
-// We keep the case arm so unknown event types are flagged, but avoid
-// an unconditional status write that could clobber concurrent operations.
-func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {}
+// handleFailed marks a sandbox as "error" when the host agent reports a crash
+// or the CP's background goroutine publishes a failure. It is invoked for both
+// sandbox.failed and sandbox.error events.
+//
+// The update is conditional so concurrent operations are not clobbered. Two
+// source statuses are tried in order, stopping at the first that succeeds:
+//   - running → error (VM crash pushed by host agent)
+//   - starting → error (create failed)
+//
+// pgx.ErrNoRows means the sandbox was not in that status and is expected; any
+// other error is logged rather than silently dropped, and the next transition
+// is still attempted.
+func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
+	for _, from := range []string{"running", "starting"} {
+		_, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
+			ID: sandboxID, Status: from, Status_2: "error",
+		})
+		if err == nil {
+			return
+		}
+		if !errors.Is(err, pgx.ErrNoRows) {
+			slog.Warn("sandbox event consumer: failed to update sandbox to error",
+				"sandbox_id", sandboxID, "from_status", from, "error", err)
+		}
+	}
+}

 func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {
 	sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{