forked from wrenn/wrenn
feat: async sandbox lifecycle with Redis Stream events
Replace synchronous RPC-based CP-host communication for sandbox lifecycle operations (Create, Pause, Resume, Destroy) with an async pattern. CP handlers now return 202 Accepted immediately, fire agent RPCs in background goroutines, and publish state events to a Redis Stream. A background consumer processes events as a fallback writer. Agent-side auto-pause events are pushed to the CP via HTTP callback (POST /v1/hosts/sandbox-events), keeping Redis internal to the CP. All DB status transitions use conditional updates (UpdateSandboxStatusIf, UpdateSandboxRunningIf) to prevent race conditions between concurrent operations and background goroutines. The HostMonitor reconciler is kept at 60s as a safety net, extended to handle transient statuses (starting, pausing, resuming, stopping). Frontend updated to handle 202 responses with empty bodies and render transient statuses with blue indicators.
This commit is contained in:
129
internal/hostagent/callback.go
Normal file
129
internal/hostagent/callback.go
Normal file
@ -0,0 +1,129 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CallbackEvent is the payload sent to the CP's sandbox event callback endpoint.
|
||||
type CallbackEvent struct {
|
||||
Event string `json:"event"`
|
||||
SandboxID string `json:"sandbox_id"`
|
||||
HostID string `json:"host_id"`
|
||||
Timestamp int64 `json:"timestamp"`
|
||||
}
|
||||
|
||||
// CallbackSender sends sandbox lifecycle events to the CP via HTTP POST.
|
||||
// Used for autonomous agent-side events (auto-pause, auto-destroy) that
|
||||
// the CP cannot observe through its own RPC goroutines.
|
||||
type CallbackSender struct {
|
||||
cpURL string
|
||||
hostID string
|
||||
credFile string
|
||||
client *http.Client
|
||||
mu sync.RWMutex
|
||||
jwt string
|
||||
}
|
||||
|
||||
// NewCallbackSender creates a callback sender.
|
||||
func NewCallbackSender(cpURL, credFile, hostID string) *CallbackSender {
|
||||
jwt := ""
|
||||
if tf, err := LoadTokenFile(credFile); err == nil {
|
||||
jwt = tf.JWT
|
||||
}
|
||||
return &CallbackSender{
|
||||
cpURL: strings.TrimRight(cpURL, "/"),
|
||||
hostID: hostID,
|
||||
credFile: credFile,
|
||||
client: &http.Client{Timeout: 10 * time.Second},
|
||||
jwt: jwt,
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateJWT refreshes the JWT used for callback authentication.
|
||||
// Called from the heartbeat's onCredsRefreshed hook.
|
||||
func (s *CallbackSender) UpdateJWT(jwt string) {
|
||||
s.mu.Lock()
|
||||
s.jwt = jwt
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
func (s *CallbackSender) getJWT() string {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.jwt
|
||||
}
|
||||
|
||||
// Send sends a callback event to the CP synchronously with retries.
|
||||
func (s *CallbackSender) Send(ctx context.Context, ev CallbackEvent) error {
|
||||
ev.HostID = s.hostID
|
||||
if ev.Timestamp == 0 {
|
||||
ev.Timestamp = time.Now().Unix()
|
||||
}
|
||||
|
||||
body, err := json.Marshal(ev)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal callback event: %w", err)
|
||||
}
|
||||
|
||||
url := s.cpURL + "/v1/hosts/sandbox-events"
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < 3; attempt++ {
|
||||
if attempt > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(time.Duration(attempt) * 500 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return fmt.Errorf("create callback request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("X-Host-Token", s.getJWT())
|
||||
|
||||
resp, err := s.client.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden {
|
||||
if newCreds, refreshErr := RefreshCredentials(ctx, s.cpURL, s.credFile); refreshErr == nil {
|
||||
s.UpdateJWT(newCreds.JWT)
|
||||
}
|
||||
lastErr = fmt.Errorf("callback auth failed: %d", resp.StatusCode)
|
||||
continue
|
||||
}
|
||||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
|
||||
return nil
|
||||
}
|
||||
|
||||
lastErr = fmt.Errorf("callback failed: status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return fmt.Errorf("callback failed after 3 attempts: %w", lastErr)
|
||||
}
|
||||
|
||||
// SendAsync sends a callback event in a background goroutine.
|
||||
func (s *CallbackSender) SendAsync(ev CallbackEvent) {
|
||||
go func() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
if err := s.Send(ctx, ev); err != nil {
|
||||
slog.Warn("callback send failed (reconciler will catch it)", "event", ev.Event, "sandbox_id", ev.SandboxID, "error", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
Reference in New Issue
Block a user