forked from wrenn/wrenn
feat(vm): replace Firecracker with Cloud Hypervisor
Migrate the entire VM layer from Firecracker to Cloud Hypervisor (CH). CH provides native snapshot/restore via its HTTP API, eliminating the need for custom UFFD handling, memfile processing, and snapshot header management that Firecracker required.

Key changes:
- Remove fc.go, jailer.go (FC process management)
- Remove internal/uffd/ package (userfaultfd lazy page loading)
- Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format)
- Add ch.go (CH HTTP API client over Unix socket)
- Add process.go (CH process lifecycle with unshare+netns)
- Add chversion.go (CH version detection)
- Refactor sandbox manager: remove UFFD socket tracking, snapshot parent/diff chaining, FC-specific balloon logic; add crash watcher
- Simplify snapshot/local.go to CH's native snapshot format
- Update VM config: FirecrackerBin → VMMBin, new CH-specific fields
- Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
@ -19,6 +19,12 @@ import (
|
||||
// it is considered unreachable (3 missed 30-second heartbeats).
|
||||
const unreachableThreshold = 90 * time.Second
|
||||
|
||||
// transientGracePeriod is how long a sandbox is allowed to stay in a transient
|
||||
// status (starting, resuming, pausing, stopping) before the monitor infers a
|
||||
// final state. This prevents the monitor from racing against in-flight RPCs
|
||||
// that may not have registered the sandbox on the host agent yet.
// checkHost measures the age from the sandbox's LastUpdated timestamp.
|
||||
const transientGracePeriod = 2 * time.Minute
|
||||
|
||||
// HostMonitor runs on a fixed interval and performs two duties:
|
||||
//
|
||||
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
|
||||
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
// Sandbox is not alive on host — infer final state.
|
||||
// Sandbox is not alive on host. If the transition is recent, give the
|
||||
// in-flight RPC time to finish before declaring a final state.
|
||||
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
|
||||
slog.Debug("host monitor: transient sandbox still within grace period",
|
||||
"sandbox_id", sbIDStr, "status", sb.Status,
|
||||
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
|
||||
continue
|
||||
}
|
||||
|
||||
// Grace period expired — infer final state.
|
||||
var finalStatus string
|
||||
switch sb.Status {
|
||||
case "starting", "resuming":
|
||||
|
||||
@ -42,6 +42,7 @@ const (
|
||||
SandboxEventResumed = "sandbox.resumed"
|
||||
SandboxEventStopped = "sandbox.stopped"
|
||||
SandboxEventFailed = "sandbox.failed"
|
||||
SandboxEventError = "sandbox.error"
|
||||
SandboxEventAutoPaused = "sandbox.auto_paused"
|
||||
)
|
||||
|
||||
@ -141,7 +142,7 @@ func (c *SandboxEventConsumer) handleMessage(ctx context.Context, msg redis.XMes
|
||||
c.handlePaused(ctx, sandboxID, event)
|
||||
case SandboxEventStopped:
|
||||
c.handleStopped(ctx, sandboxID, event)
|
||||
case SandboxEventFailed:
|
||||
case SandboxEventFailed, SandboxEventError:
|
||||
c.handleFailed(ctx, sandboxID)
|
||||
case SandboxEventAutoPaused:
|
||||
c.handleAutoPaused(ctx, sandboxID, event)
|
||||
@ -187,20 +188,39 @@ func (c *SandboxEventConsumer) handlePaused(ctx context.Context, sandboxID pgtyp
|
||||
}
|
||||
|
||||
func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgtype.UUID, event SandboxEvent) {
|
||||
// Try stopping → stopped (CP-initiated destroy completed).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "stopping",
|
||||
Status_2: "stopped",
|
||||
}); err == nil {
|
||||
return
|
||||
}
|
||||
// Try running → stopped (autonomous destroy, e.g. TTL auto-destroy).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID,
|
||||
Status: "running",
|
||||
Status_2: "stopped",
|
||||
}); err != nil && !errors.Is(err, pgx.ErrNoRows) {
|
||||
slog.Warn("sandbox event consumer: failed to update sandbox to stopped", "sandbox_id", event.SandboxID, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
// handleFailed is a no-op fallback — the background goroutine already
|
||||
// performed the conditional DB update before publishing this event.
|
||||
// We keep the case arm so unknown event types are flagged, but avoid
|
||||
// an unconditional status write that could clobber concurrent operations.
|
||||
func (c *SandboxEventConsumer) handleFailed(_ context.Context, _ pgtype.UUID) {}
|
||||
// handleFailed marks a sandbox as "error" when the host agent reports a crash
|
||||
// or the CP's background goroutine publishes a failure. Uses conditional update
|
||||
// to avoid clobbering concurrent operations.
|
||||
func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
|
||||
// Try running → error (VM crash pushed by host agent).
|
||||
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "running", Status_2: "error",
|
||||
}); err == nil {
|
||||
return
|
||||
}
|
||||
// Try starting → error (create failed).
|
||||
_, _ = c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
ID: sandboxID, Status: "starting", Status_2: "error",
|
||||
})
|
||||
}
|
||||
|
||||
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {
|
||||
sb, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
|
||||
|
||||
Reference in New Issue
Block a user