Add host agent RPC server with sandbox lifecycle management

Implement the host agent as a Connect RPC server that orchestrates sandbox creation, destruction, pause/resume, and command execution. Includes sandbox manager with TTL-based reaper, network slot allocator, rootfs cloning, hostagent proto definition with generated stubs, and test/debug scripts. Fix Firecracker process lifetime bug where VM was tied to HTTP request context instead of background context.
2026-03-10 03:54:53 +06:00
parent c31ce90306
commit 6f0c365d44
24 changed files with 6236 additions and 121 deletions
--- a/scripts/test-host.sh
+++ b/scripts/test-host.sh
@ -0,0 +1,233 @@
+#!/usr/bin/env bash
+#
+# test-host.sh — Integration test for the Wrenn host agent.
+#
+# Prerequisites:
+#   - Host agent running: sudo ./builds/wrenn-agent
+#   - Firecracker installed at /usr/local/bin/firecracker
+#   - Kernel at /var/lib/wrenn/kernels/vmlinux
+#   - Base rootfs at /var/lib/wrenn/images/minimal.ext4 (with envd + wrenn-init baked in)
+#   - curl and python3 on PATH (used to call the agent and to parse its JSON replies)
+#
+# Usage:
+#   ./scripts/test-host.sh [agent_url]
+#
+# The agent URL defaults to http://localhost:50051.
+
+# Abort on any failing command, on use of an unset variable, and on a
+# failure anywhere inside a pipeline.
+set -o errexit -o nounset -o pipefail
+
+# Agent base URL, taken from the first argument when one is given.
+AGENT="${1:-http://localhost:50051}"
+# Path prefix shared by every HostAgentService method.
+BASE="/hostagent.v1.HostAgentService"
+# Set by Test 2 once a sandbox has been created.
+SANDBOX_ID=""
+
+# ANSI colour sequences, stored as literal backslash escapes that the
+# output helpers expand when printing.
+NC='\033[0m'
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
+
+# Output helpers. Each takes one argument, the message to print.
+#   pass  prints a green PASS line
+#   fail  prints a red FAIL line and exits the script with status 1
+#   info  prints a yellow separator line
+# The colour codes are printed with %b so their escapes are expanded, while
+# the message is printed with %s so any backslashes in it, for instance from
+# sandbox output, appear literally instead of being interpreted.
+pass() { printf '%b: %s\n' "${GREEN}PASS${NC}" "$1"; }
+fail() { printf '%b: %s\n' "${RED}FAIL${NC}" "$1"; exit 1; }
+info() { printf '%b: %s\n' "${YELLOW}----${NC}" "$1"; }
+
+# rpc METHOD JSON_BODY
+# POSTs JSON_BODY to AGENT BASE METHOD and writes the raw response body to
+# stdout. HTTP error statuses are deliberately not turned into a failure,
+# because tests 7 and 12 inspect the error body. Transport failures such as
+# a refused connection still make curl exit non-zero, and -S lets curl print
+# the reason to stderr instead of the script dying silently under errexit.
+rpc() {
+    local method="$1"
+    local body="$2"
+    curl -sS -X POST \
+        -H "Content-Type: application/json" \
+        -d "${body}" \
+        "${AGENT}${BASE}/${method}"
+}
+
+# ──────────────────────────────────────────────────
+# Test 1: List sandboxes (should be empty)
+# ──────────────────────────────────────────────────
+info "Test 1: List sandboxes (expect empty)"
+
+RESULT=$(rpc "ListSandboxes" '{}')
+echo "  Response: ${RESULT}"
+
+# Accept either a sandboxes field or an empty JSON object.
+# The here-string avoids a pipeline, so an early grep exit cannot trip
+# pipefail, and the guard form keeps the pass and fail paths separate.
+grep -q -e '"sandboxes"' -e '{}' <<<"${RESULT}" || fail "ListSandboxes failed"
+pass "ListSandboxes returned"
+
+# ──────────────────────────────────────────────────
+# Test 2: Create a sandbox
+# ──────────────────────────────────────────────────
+info "Test 2: Create a sandbox"
+
+RESULT=$(rpc "CreateSandbox" '{
+    "template": "minimal",
+    "vcpus": 1,
+    "memoryMb": 512,
+    "timeoutSec": 300
+}')
+echo "  Response: ${RESULT}"
+
+# Pull the sandboxId field out of the JSON reply. A missing key makes
+# python3 exit non-zero, which is reported through fail.
+SANDBOX_ID=$(python3 -c "import sys,json; print(json.load(sys.stdin)['sandboxId'])" <<<"${RESULT}" 2>/dev/null) || \
+    fail "CreateSandbox did not return sandboxId"
+# An empty id would make every later grep for it match trivially.
+[ -n "${SANDBOX_ID}" ] || fail "CreateSandbox did not return sandboxId"
+
+echo "  Sandbox ID: ${SANDBOX_ID}"
+pass "Sandbox created: ${SANDBOX_ID}"
+
+# ──────────────────────────────────────────────────
+# Test 3: List sandboxes (should have one)
+# ──────────────────────────────────────────────────
+info "Test 3: List sandboxes (expect one)"
+
+RESULT=$(rpc "ListSandboxes" '{}')
+echo "  Response: ${RESULT}"
+
+# Match the id as a fixed string rather than as a regular expression.
+grep -Fq -- "${SANDBOX_ID}" <<<"${RESULT}" || fail "Sandbox not found in list"
+pass "Sandbox ${SANDBOX_ID} found in list"
+
+# ──────────────────────────────────────────────────
+# Test 4: Execute a command
+# ──────────────────────────────────────────────────
+info "Test 4: Execute 'echo hello world'"
+
+RESULT=$(rpc "Exec" "{
+    \"sandboxId\": \"${SANDBOX_ID}\",
+    \"cmd\": \"/bin/sh\",
+    \"args\": [\"-c\", \"echo hello world\"],
+    \"timeoutSec\": 10
+}")
+echo "  Response: ${RESULT}"
+
+# stdout is base64-encoded in Connect RPC JSON.
+STDOUT=$(echo "${RESULT}" | python3 -c "
+import sys, json, base64
+r = json.load(sys.stdin)
+print(base64.b64decode(r['stdout']).decode().strip())
+" 2>/dev/null) || fail "Exec did not return stdout"
+
+# Guard form: fail exits on a mismatch, otherwise fall through to pass.
+[ "${STDOUT}" = "hello world" ] || fail "Expected 'hello world', got '${STDOUT}'"
+pass "Exec returned correct output: '${STDOUT}'"
+
+# ──────────────────────────────────────────────────
+# Test 5: Execute a multi-line command
+# ──────────────────────────────────────────────────
+info "Test 5: Execute multi-line command"
+
+RESULT=$(rpc "Exec" "{
+    \"sandboxId\": \"${SANDBOX_ID}\",
+    \"cmd\": \"/bin/sh\",
+    \"args\": [\"-c\", \"echo line1; echo line2; echo line3\"],
+    \"timeoutSec\": 10
+}")
+echo "  Response: ${RESULT}"
+
+# Decode the base64 stdout field and count its lines. Without the explicit
+# fail, a reply lacking stdout would end the script under errexit with no
+# FAIL line at all.
+LINE_COUNT=$(echo "${RESULT}" | python3 -c "
+import sys, json, base64
+r = json.load(sys.stdin)
+lines = base64.b64decode(r['stdout']).decode().strip().split('\n')
+print(len(lines))
+" 2>/dev/null) || fail "Exec did not return stdout"
+
+[ "${LINE_COUNT}" = "3" ] || fail "Expected 3 lines, got ${LINE_COUNT}"
+pass "Multi-line output: ${LINE_COUNT} lines"
+
+# ──────────────────────────────────────────────────
+# Test 6: Pause the sandbox
+# ──────────────────────────────────────────────────
+info "Test 6: Pause sandbox"
+
+RESULT=$(rpc "PauseSandbox" "{\"sandboxId\": \"${SANDBOX_ID}\"}")
+echo "  Response: ${RESULT}"
+
+# Verify status is paused by looking for the quoted value in a fresh listing.
+LIST=$(rpc "ListSandboxes" '{}')
+grep -q '"paused"' <<<"${LIST}" || fail "Sandbox not in paused state"
+pass "Sandbox paused"
+
+# ──────────────────────────────────────────────────
+# Test 7: Exec should fail while paused
+# ──────────────────────────────────────────────────
+info "Test 7: Exec while paused (expect error)"
+
+RESULT=$(rpc "Exec" "{
+    \"sandboxId\": \"${SANDBOX_ID}\",
+    \"cmd\": \"/bin/echo\",
+    \"args\": [\"should fail\"]
+}")
+echo "  Response: ${RESULT}"
+
+echo "${RESULT}" | grep -qi "not running\|error\|code" && \
+    pass "Exec correctly rejected while paused" || \
+    fail "Exec should have failed while paused"
+
+# ──────────────────────────────────────────────────
+# Test 8: Resume the sandbox
+# ──────────────────────────────────────────────────
+info "Test 8: Resume sandbox"
+
+RESULT=$(rpc "ResumeSandbox" "{\"sandboxId\": \"${SANDBOX_ID}\"}")
+echo "  Response: ${RESULT}"
+
+# Verify status is running by looking for the quoted value in a fresh listing.
+LIST=$(rpc "ListSandboxes" '{}')
+grep -q '"running"' <<<"${LIST}" || fail "Sandbox not in running state"
+pass "Sandbox resumed"
+
+# ──────────────────────────────────────────────────
+# Test 9: Exec after resume
+# ──────────────────────────────────────────────────
+info "Test 9: Exec after resume"
+
+RESULT=$(rpc "Exec" "{
+    \"sandboxId\": \"${SANDBOX_ID}\",
+    \"cmd\": \"/bin/sh\",
+    \"args\": [\"-c\", \"echo resumed ok\"],
+    \"timeoutSec\": 10
+}")
+echo "  Response: ${RESULT}"
+
+# Decode the base64 stdout field of the reply.
+STDOUT=$(echo "${RESULT}" | python3 -c "
+import sys, json, base64
+r = json.load(sys.stdin)
+print(base64.b64decode(r['stdout']).decode().strip())
+" 2>/dev/null) || fail "Exec after resume failed"
+
+# Guard form: fail exits on a mismatch, otherwise fall through to pass.
+[ "${STDOUT}" = "resumed ok" ] || fail "Expected 'resumed ok', got '${STDOUT}'"
+pass "Exec after resume works: '${STDOUT}'"
+
+# ──────────────────────────────────────────────────
+# Test 10: Destroy the sandbox
+# ──────────────────────────────────────────────────
+info "Test 10: Destroy sandbox"
+
+RESULT=$(rpc "DestroySandbox" "{\"sandboxId\": \"${SANDBOX_ID}\"}")
+echo "  Response: ${RESULT}"
+pass "Sandbox destroyed"
+
+# ──────────────────────────────────────────────────
+# Test 11: List sandboxes (should be empty again)
+# ──────────────────────────────────────────────────
+info "Test 11: List sandboxes (expect empty)"
+
+RESULT=$(rpc "ListSandboxes" '{}')
+echo "  Response: ${RESULT}"
+
+# The destroyed id must no longer appear. It is matched as a fixed string.
+! grep -Fq -- "${SANDBOX_ID}" <<<"${RESULT}" || fail "Destroyed sandbox still in list"
+pass "Sandbox list is clean"
+
+# ──────────────────────────────────────────────────
+# Test 12: Destroy non-existent sandbox (expect error)
+# ──────────────────────────────────────────────────
+info "Test 12: Destroy non-existent sandbox (expect error)"
+
+RESULT=$(rpc "DestroySandbox" '{"sandboxId": "sb-nonexist"}')
+echo "  Response: ${RESULT}"
+
+echo "${RESULT}" | grep -qi "not found\|error\|code" && \
+    pass "Correctly rejected non-existent sandbox" || \
+    fail "Should have returned error for non-existent sandbox"
+
+# Blank separator line followed by the green summary line.
+printf '\n%b\n' "${GREEN}All tests passed!${NC}"
--- a/scripts/update-debug-rootfs.sh
+++ b/scripts/update-debug-rootfs.sh
@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+#
+# update-debug-rootfs.sh — Build envd and inject it (plus wrenn-init) into the debug rootfs.
+#
+# This script:
+#   1. Builds a fresh envd static binary via make
+#   2. Mounts the rootfs image
+#   3. Copies envd and wrenn-init into the image
+#   4. Unmounts cleanly
+#
+# Usage:
+#   bash scripts/update-debug-rootfs.sh [rootfs_path]
+#
+# Defaults to /var/lib/wrenn/images/minimal.ext4
+
+# Abort on any failing command, on use of an unset variable, and on a
+# failure anywhere inside a pipeline.
+set -o errexit -o nounset -o pipefail
+
+# Absolute directory holding this script, and the project root above it.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
+# Image to update, overridable through the first argument.
+ROOTFS="${1:-/var/lib/wrenn/images/minimal.ext4}"
+# Directory the image gets loop mounted on.
+MOUNT_DIR="/tmp/wrenn-rootfs-update"
+
+# Refuse to continue when the image file does not exist.
+if [ ! -f "${ROOTFS}" ]; then
+    echo "ERROR: Rootfs not found at ${ROOTFS}" >&2
+    exit 1
+fi
+# Pin a relative path to the current directory now. The build step changes
+# directory before the image is mounted, so a relative path would otherwise
+# be checked in one place and mounted from another.
+[[ "${ROOTFS}" == /* ]] || ROOTFS="${PWD}/${ROOTFS}"
+
+# Step 1: Build envd.
+# The build runs from the project root and is expected to leave the binary
+# at builds/envd. Failures are reported on stderr.
+echo "==> Building envd..."
+cd "${PROJECT_ROOT}"
+make build-envd
+ENVD_BIN="${PROJECT_ROOT}/builds/envd"
+
+if [ ! -f "${ENVD_BIN}" ]; then
+    echo "ERROR: envd binary not found at ${ENVD_BIN}" >&2
+    exit 1
+fi
+
+# Verify the binary is statically linked, going by the output of file.
+if ! file "${ENVD_BIN}" | grep -q "statically linked"; then
+    echo "ERROR: envd is not statically linked!" >&2
+    exit 1
+fi
+
+# Step 2: Mount the rootfs.
+echo "==> Mounting rootfs at ${MOUNT_DIR}..."
+mkdir -p "${MOUNT_DIR}"
+sudo mount -o loop "${ROOTFS}" "${MOUNT_DIR}"
+
+cleanup() {
+    echo "==> Unmounting rootfs..."
+    sudo umount "${MOUNT_DIR}" 2>/dev/null || true
+    rmdir "${MOUNT_DIR}" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+# Step 3: Copy files into rootfs.
+# Both files land in usr/local/bin inside the mounted image with mode 755.
+ROOTFS_BIN_DIR="${MOUNT_DIR}/usr/local/bin"
+
+echo "==> Installing envd..."
+sudo mkdir -p "${ROOTFS_BIN_DIR}"
+sudo cp "${ENVD_BIN}" "${ROOTFS_BIN_DIR}/envd"
+sudo chmod 755 "${ROOTFS_BIN_DIR}/envd"
+
+echo "==> Installing wrenn-init..."
+sudo cp "${PROJECT_ROOT}/images/wrenn-init.sh" "${ROOTFS_BIN_DIR}/wrenn-init"
+sudo chmod 755 "${ROOTFS_BIN_DIR}/wrenn-init"
+
+# Step 4: Verify.
+# List both installed files, then print the closing summary line.
+printf '\n%s\n' "==> Installed files:"
+ls -la \
+    "${MOUNT_DIR}/usr/local/bin/envd" \
+    "${MOUNT_DIR}/usr/local/bin/wrenn-init"
+
+printf '\n%s\n' "==> Done. Rootfs updated: ${ROOTFS}"