1
0
forked from wrenn/wrenn

feat(vm): replace Firecracker with Cloud Hypervisor

Migrate the entire VM layer from Firecracker (FC) to Cloud Hypervisor (CH).
CH provides native snapshot/restore via its HTTP API, eliminating the
need for custom UFFD handling, memfile processing, and snapshot header
management that Firecracker required.

Key changes:
- Remove fc.go, jailer.go (FC process management)
- Remove internal/uffd/ package (userfaultfd lazy page loading)
- Remove snapshot/header.go, mapping.go, memfile.go (FC snapshot format)
- Add ch.go (CH HTTP API client over Unix socket)
- Add process.go (CH process lifecycle with unshare+netns)
- Add chversion.go (CH version detection)
- Refactor sandbox manager: remove UFFD socket tracking, snapshot
  parent/diff chaining, FC-specific balloon logic; add crash watcher
- Simplify snapshot/local.go to use CH's native snapshot format
- Update VM config: FirecrackerBin → VMMBin, new CH-specific fields
- Update envdclient, devicemapper, network for CH compatibility
This commit is contained in:
2026-05-17 01:33:12 +06:00
parent c2dc382787
commit eaa6b8576d
25 changed files with 754 additions and 2267 deletions

View File

@ -19,6 +19,12 @@ import (
// it is considered unreachable (3 missed 30-second heartbeats).
const unreachableThreshold time.Duration = 90 * time.Second
// transientGracePeriod bounds how long a sandbox may remain in a transient
// status (starting, resuming, pausing, stopping) before the monitor infers a
// final state for it. The monitor compares this against the time elapsed since
// the sandbox's LastUpdated timestamp; while that age is below the grace
// period the sandbox is skipped, so the monitor does not race an in-flight RPC
// that may not have registered the sandbox on the host agent yet.
const transientGracePeriod time.Duration = 2 * time.Minute
// HostMonitor runs on a fixed interval and performs two duties:
//
// 1. Passive check: marks hosts whose last_heartbeat_at is stale as
@ -257,7 +263,16 @@ func (m *HostMonitor) checkHost(ctx context.Context, host db.Host) {
}
continue
}
// Sandbox is not alive on host — infer final state.
// Sandbox is not alive on host. If the transition is recent, give the
// in-flight RPC time to finish before declaring a final state.
if sb.LastUpdated.Valid && time.Since(sb.LastUpdated.Time) < transientGracePeriod {
slog.Debug("host monitor: transient sandbox still within grace period",
"sandbox_id", sbIDStr, "status", sb.Status,
"age", time.Since(sb.LastUpdated.Time).Round(time.Second))
continue
}
// Grace period expired — infer final state.
var finalStatus string
switch sb.Status {
case "starting", "resuming":