1
0
forked from wrenn/wrenn

fix: resolve bugs and DRY violations in sandbox manager and API handlers

- Fix createFromSnapshot discarding memoryMB param (balloon optimization was dead)
- Fix double dm-snapshot removal in Pause() cleanupPauseFailure path
- Fix DestroySandbox RPC mapping all errors to CodeNotFound
- Fix handleFailed event consumer missing pausing/resuming → error transitions
- Fix stream resource leak in StreamUpload on early-return paths
- Add envs/cwd fields to ExecRequest proto for foreground exec parity
- Extract createResources rollback helper to eliminate 4x duplicated teardown
- Remove unused chClient.ping method
- Add .mcp.json to gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-17 02:30:32 +06:00
parent 74f85ce4e9
commit 62bede5dae
10 changed files with 272 additions and 181 deletions

View File

@ -119,6 +119,8 @@ func (h *execHandler) Exec(w http.ResponseWriter, r *http.Request) {
Cmd: req.Cmd,
Args: req.Args,
TimeoutSec: req.TimeoutSec,
Envs: req.Envs,
Cwd: req.Cwd,
}))
if err != nil {
status, code, msg := agentErrToHTTP(err)

View File

@ -89,6 +89,12 @@ func (h *filesStreamHandler) StreamUpload(w http.ResponseWriter, r *http.Request
// Open client-streaming RPC to host agent.
stream := agent.WriteFileStream(ctx)
var streamClosed bool
defer func() {
if !streamClosed {
stream.CloseAndReceive()
}
}()
// Send metadata first.
if err := stream.Send(&pb.WriteFileStreamRequest{
@ -127,6 +133,7 @@ func (h *filesStreamHandler) StreamUpload(w http.ResponseWriter, r *http.Request
}
// Close and receive response.
streamClosed = true
if _, err := stream.CloseAndReceive(); err != nil {
status, code, msg := agentErrToHTTP(err)
writeError(w, status, code, msg)

View File

@ -210,16 +210,14 @@ func (c *SandboxEventConsumer) handleStopped(ctx context.Context, sandboxID pgty
// or the CP's background goroutine publishes a failure. Uses conditional update
// to avoid clobbering concurrent operations.
func (c *SandboxEventConsumer) handleFailed(ctx context.Context, sandboxID pgtype.UUID) {
// Try running → error (VM crash pushed by host agent).
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "running", Status_2: "error",
}); err == nil {
return
// Try each possible pre-failure state until one matches.
for _, fromStatus := range []string{"running", "starting", "pausing", "resuming"} {
if _, err := c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: fromStatus, Status_2: "error",
}); err == nil {
return
}
}
// Try starting → error (create failed).
_, _ = c.db.UpdateSandboxStatusIf(ctx, db.UpdateSandboxStatusIfParams{
ID: sandboxID, Status: "starting", Status_2: "error",
})
}
func (c *SandboxEventConsumer) handleAutoPaused(ctx context.Context, sandboxID pgtype.UUID, _ SandboxEvent) {

View File

@ -78,15 +78,30 @@ type ExecResult struct {
ExitCode int32
}
// ExecOpts holds optional parameters for Exec.
type ExecOpts struct {
Envs map[string]string
Cwd string
}
// Exec runs a command inside the sandbox and collects all stdout/stderr output.
// It blocks until the command completes.
func (c *Client) Exec(ctx context.Context, cmd string, args ...string) (*ExecResult, error) {
func (c *Client) Exec(ctx context.Context, cmd string, args []string, opts *ExecOpts) (*ExecResult, error) {
stdin := false
proc := &envdpb.ProcessConfig{
Cmd: cmd,
Args: args,
}
if opts != nil {
if len(opts.Envs) > 0 {
proc.Envs = opts.Envs
}
if opts.Cwd != "" {
proc.Cwd = &opts.Cwd
}
}
req := connect.NewRequest(&envdpb.StartRequest{
Process: &envdpb.ProcessConfig{
Cmd: cmd,
Args: args,
},
Process: proc,
Stdin: &stdin,
})

View File

@ -20,6 +20,7 @@ import (
pb "git.omukk.dev/wrenn/wrenn/proto/hostagent/gen"
"git.omukk.dev/wrenn/wrenn/proto/hostagent/gen/hostagentv1connect"
"git.omukk.dev/wrenn/wrenn/internal/envdclient"
"git.omukk.dev/wrenn/wrenn/internal/sandbox"
)
@ -90,7 +91,10 @@ func (s *Server) DestroySandbox(
req *connect.Request[pb.DestroySandboxRequest],
) (*connect.Response[pb.DestroySandboxResponse], error) {
if err := s.mgr.Destroy(ctx, req.Msg.SandboxId); err != nil {
return nil, connect.NewError(connect.CodeNotFound, err)
if strings.Contains(err.Error(), "not found") {
return nil, connect.NewError(connect.CodeNotFound, err)
}
return nil, connect.NewError(connect.CodeInternal, err)
}
return connect.NewResponse(&pb.DestroySandboxResponse{}), nil
}
@ -216,7 +220,12 @@ func (s *Server) Exec(
execCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
result, err := s.mgr.Exec(execCtx, msg.SandboxId, msg.Cmd, msg.Args...)
var opts *envdclient.ExecOpts
if len(msg.Envs) > 0 || msg.Cwd != "" {
opts = &envdclient.ExecOpts{Envs: msg.Envs, Cwd: msg.Cwd}
}
result, err := s.mgr.Exec(execCtx, msg.SandboxId, msg.Cmd, msg.Args, opts)
if err != nil {
return nil, connect.NewError(connect.CodeInternal, fmt.Errorf("exec: %w", err))
}

View File

@ -201,24 +201,30 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
return nil, fmt.Errorf("create dm-snapshot: %w", err)
}
res := &createResources{
sandboxID: sandboxID,
loops: m.loops,
loopImage: baseRootfs,
dmDevice: dmDev,
cowPath: cowPath,
slots: m.slots,
}
// Allocate network slot.
slotIdx, err := m.slots.Allocate()
if err != nil {
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("allocate network slot: %w", err)
}
res.slotIdx = slotIdx
slot := network.NewSlot(slotIdx)
// Set up network.
if err := network.CreateNetwork(slot); err != nil {
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("create network: %w", err)
}
res.slot = slot
// Boot VM — CH gets the dm device path.
vmCfg := vm.VMConfig{
@ -238,13 +244,10 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
}
if _, err := m.vm.Create(ctx, vmCfg); err != nil {
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("create VM: %w", err)
}
res.vm = m.vm
// Wait for envd to be ready.
client := envdclient.New(slot.HostIP.String())
@ -252,12 +255,7 @@ func (m *Manager) Create(ctx context.Context, sandboxID string, teamID, template
defer waitCancel()
if err := client.WaitUntilReady(waitCtx); err != nil {
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("wait for envd: %w", err)
}
@ -538,14 +536,16 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
// ── Step 7: Remove dm-snapshot and save CoW ──────────────────────
if sb.dmDevice != nil {
if err := devicemapper.RemoveSnapshot(ctx, sb.dmDevice); err != nil {
dmDev := sb.dmDevice
sb.dmDevice = nil
if err := devicemapper.RemoveSnapshot(ctx, dmDev); err != nil {
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("remove dm-snapshot: %w", err)
}
snapshotCow := snapshot.CowPath(pauseDir, "")
if err := os.Rename(sb.dmDevice.CowPath, snapshotCow); err != nil {
os.Remove(sb.dmDevice.CowPath)
if err := os.Rename(dmDev.CowPath, snapshotCow); err != nil {
os.Remove(dmDev.CowPath)
m.cleanupPauseFailure(sb, sandboxID, pauseDir)
return fmt.Errorf("move cow file: %w", err)
}
@ -616,38 +616,42 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
return nil, fmt.Errorf("move cow file: %w", err)
}
rollbackCow := func() {
if err := os.Rename(cowPath, savedCow); err != nil {
slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
}
}
// Restore dm-snapshot from existing persistent CoW file.
dmName := "wrenn-" + sandboxID
dmDev, err := devicemapper.RestoreSnapshot(ctx, dmName, originLoop, cowPath, originSize)
if err != nil {
m.loops.Release(baseImagePath)
rollbackCow()
os.Rename(cowPath, savedCow)
return nil, fmt.Errorf("restore dm-snapshot: %w", err)
}
res := &createResources{
sandboxID: sandboxID,
loops: m.loops,
loopImage: baseImagePath,
dmDevice: dmDev,
slots: m.slots,
rollCow: func() {
if err := os.Rename(cowPath, savedCow); err != nil {
slog.Warn("failed to rollback cow file", "src", cowPath, "dst", savedCow, "error", err)
}
},
}
// Allocate network slot.
slotIdx, err := m.slots.Allocate()
if err != nil {
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
res.rollback()
return nil, fmt.Errorf("allocate network slot: %w", err)
}
res.slotIdx = slotIdx
slot := network.NewSlot(slotIdx)
if err := network.CreateNetwork(slot); err != nil {
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
res.rollback()
return nil, fmt.Errorf("create network: %w", err)
}
res.slot = slot
// Restore VM from CH snapshot.
vmCfg := vm.VMConfig{
@ -655,6 +659,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
TemplateID: meta.TemplateID,
KernelPath: m.resolveKernelPath(kernelVersion),
RootfsPath: dmDev.DevicePath,
MemoryMB: meta.MemoryMB,
NetworkNamespace: slot.NamespaceID,
TapDevice: slot.TapName,
TapMAC: slot.TapMAC,
@ -665,13 +670,10 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
}
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, pauseDir); err != nil {
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
res.rollback()
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
res.vm = m.vm
// Wait for envd to be ready.
client := envdclient.New(slot.HostIP.String())
@ -679,12 +681,7 @@ func (m *Manager) Resume(ctx context.Context, sandboxID string, timeoutSec int,
if err := client.WaitUntilReady(waitCtx); err != nil {
waitCancel()
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
rollbackCow()
m.loops.Release(baseImagePath)
res.rollback()
return nil, fmt.Errorf("wait for envd: %w", err)
}
waitCancel()
@ -905,7 +902,7 @@ func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, t
func() {
syncCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
if _, err := sb.client.Exec(syncCtx, "/bin/sync"); err != nil {
if _, err := sb.client.Exec(syncCtx, "/bin/sync", nil, nil); err != nil {
slog.Warn("flatten: guest sync failed (non-fatal)", "id", sb.ID, "error", err)
}
}()
@ -1001,7 +998,7 @@ func (m *Manager) DeleteSnapshot(teamID, templateID pgtype.UUID) error {
// CH handles memory restore internally (with on-demand paging).
// The template's rootfs.ext4 is a flattened standalone image — we create a
// dm-snapshot on top of it just like a normal Create.
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, _, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, teamID, templateID pgtype.UUID, vcpus, memoryMB, timeoutSec, diskSizeMB int) (*models.Sandbox, error) {
tmplDir := layout.TemplateDir(m.cfg.WrennDir, teamID, templateID)
// Set up dm-snapshot on the template's flattened rootfs.
@ -1026,23 +1023,29 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
return nil, fmt.Errorf("create dm-snapshot: %w", err)
}
res := &createResources{
sandboxID: sandboxID,
loops: m.loops,
loopImage: baseRootfs,
dmDevice: dmDev,
cowPath: cowPath,
slots: m.slots,
}
// Allocate network.
slotIdx, err := m.slots.Allocate()
if err != nil {
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("allocate network slot: %w", err)
}
res.slotIdx = slotIdx
slot := network.NewSlot(slotIdx)
if err := network.CreateNetwork(slot); err != nil {
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("create network: %w", err)
}
res.slot = slot
// Restore VM from CH snapshot.
vmCfg := vm.VMConfig{
@ -1051,6 +1054,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
KernelPath: m.cfg.KernelPath,
RootfsPath: dmDev.DevicePath,
VCPUs: vcpus,
MemoryMB: memoryMB,
NetworkNamespace: slot.NamespaceID,
TapDevice: slot.TapName,
TapMAC: slot.TapMAC,
@ -1061,13 +1065,10 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
}
if _, err := m.vm.CreateFromSnapshot(ctx, vmCfg, tmplDir); err != nil {
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("restore VM from snapshot: %w", err)
}
res.vm = m.vm
// Wait for envd.
client := envdclient.New(slot.HostIP.String())
@ -1075,12 +1076,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
if err := client.WaitUntilReady(waitCtx); err != nil {
waitCancel()
warnErr("vm destroy error", sandboxID, m.vm.Destroy(context.Background(), sandboxID))
warnErr("network cleanup error", sandboxID, network.RemoveNetwork(slot))
m.slots.Release(slotIdx)
warnErr("dm-snapshot remove error", sandboxID, devicemapper.RemoveSnapshot(context.Background(), dmDev))
os.Remove(cowPath)
m.loops.Release(baseRootfs)
res.rollback()
return nil, fmt.Errorf("wait for envd: %w", err)
}
waitCancel()
@ -1109,6 +1105,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
TemplateTeamID: teamID.Bytes,
TemplateID: templateID.Bytes,
VCPUs: vcpus,
MemoryMB: memoryMB,
TimeoutSec: timeoutSec,
SlotIndex: slotIdx,
HostIP: slot.HostIP,
@ -1143,7 +1140,7 @@ func (m *Manager) createFromSnapshot(ctx context.Context, sandboxID string, team
}
// Exec runs a command inside a sandbox.
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args ...string) (*envdclient.ExecResult, error) {
func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args []string, opts *envdclient.ExecOpts) (*envdclient.ExecResult, error) {
sb, err := m.get(sandboxID)
if err != nil {
return nil, err
@ -1157,7 +1154,7 @@ func (m *Manager) Exec(ctx context.Context, sandboxID string, cmd string, args .
sb.LastActiveAt = time.Now()
m.mu.Unlock()
return sb.client.Exec(ctx, cmd, args...)
return sb.client.Exec(ctx, cmd, args, opts)
}
// ExecStream runs a command inside a sandbox and returns a channel of streaming events.
@ -1538,6 +1535,44 @@ func warnErr(msg string, id string, err error) {
}
}
// createResources tracks partially-acquired resources during sandbox creation
// so they can be rolled back in reverse order on failure.
type createResources struct {
sandboxID string
loops *devicemapper.LoopRegistry
vm *vm.Manager
loopImage string
dmDevice *devicemapper.SnapshotDevice
cowPath string
slotIdx int
slots *network.SlotAllocator
slot *network.Slot
rollCow func() // optional custom cow rollback (e.g. rename back)
}
func (r *createResources) rollback() {
if r.vm != nil && r.sandboxID != "" {
warnErr("vm destroy error", r.sandboxID, r.vm.Destroy(context.Background(), r.sandboxID))
}
if r.slot != nil {
warnErr("network cleanup error", r.sandboxID, network.RemoveNetwork(r.slot))
}
if r.slots != nil && r.slotIdx > 0 {
r.slots.Release(r.slotIdx)
}
if r.dmDevice != nil {
warnErr("dm-snapshot remove error", r.sandboxID, devicemapper.RemoveSnapshot(context.Background(), r.dmDevice))
}
if r.rollCow != nil {
r.rollCow()
} else if r.cowPath != "" {
os.Remove(r.cowPath)
}
if r.loopImage != "" {
r.loops.Release(r.loopImage)
}
}
// startCrashWatcher monitors the VM process for unexpected exits.
// If the process exits while the sandbox is still in m.boxes (i.e. not a
// deliberate Destroy), the sandbox is cleaned up and a sandbox.error event

View File

@ -206,8 +206,3 @@ func (c *chClient) resizeBalloon(ctx context.Context, sizeBytes int64) error {
"desired_balloon": sizeBytes,
})
}
// ping checks if the VMM is alive and ready to accept commands.
func (c *chClient) ping(ctx context.Context) error {
return c.do(ctx, http.MethodGet, "/api/v1/vmm.ping", nil)
}