
fix: resolve process stream hangs, pause race, and PTY signal loss

- Cache terminal EndEvent on ProcessHandle so connect() can detect
  already-exited processes instead of hanging forever on broadcast
  receivers that missed the event. Subscribe before checking the cache
  to close the TOCTOU window (see the sketch after this list).

- Protect sb.Status writes in Pause with m.mu to prevent data race
  with concurrent readers (AcquireProxyConn, Exec, etc.).

- Restart metrics sampler in restoreRunning so a failed pause attempt
  doesn't permanently kill sandbox metrics collection.

- Return dequeued non-input messages from coalescePtyInput instead of
  dropping them, preventing silent loss of kill/resize signals during
  typing bursts.
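
The ordering in the first bullet is the load-bearing detail: the waiter
thread stores the EndEvent in the ended cache before broadcasting it, and
connect() subscribes before reading the cache, so whichever side loses the
race still observes the event through the other path. What follows is a
minimal, self-contained sketch of that ordering, not the commit's code: it
assumes tokio's broadcast channel and invents a stripped-down Handle type
(finish, wait_end) purely for illustration.

// Sketch only. Requires tokio with the "sync", "macros" and
// "rt-multi-thread" features. Handle, finish and wait_end are
// illustrative names, not the commit's identifiers.
use std::sync::{Arc, Mutex};
use tokio::sync::broadcast;

#[derive(Clone, Debug)]
struct EndEvent {
    exit_code: i32,
}

struct Handle {
    end_tx: broadcast::Sender<EndEvent>,
    ended: Mutex<Option<EndEvent>>,
}

impl Handle {
    // Publisher side: cache first, then broadcast. A consumer that
    // subscribed too late for the broadcast still finds the event here.
    fn finish(&self, ev: EndEvent) {
        *self.ended.lock().unwrap() = Some(ev.clone());
        let _ = self.end_tx.send(ev);
    }

    // Consumer side: subscribe *before* checking the cache. If the event
    // landed before the subscription, the cache has it; if it lands after,
    // the receiver delivers it. Neither ordering can miss it.
    async fn wait_end(&self) -> EndEvent {
        let mut end_rx = self.end_tx.subscribe();
        if let Some(ev) = self.ended.lock().unwrap().clone() {
            return ev;
        }
        end_rx.recv().await.expect("end_tx outlives the receiver")
    }
}

#[tokio::main]
async fn main() {
    let handle = Arc::new(Handle {
        end_tx: broadcast::channel(16).0,
        ended: Mutex::new(None),
    });
    // Simulate a process that exited before any client connected.
    handle.finish(EndEvent { exit_code: 0 });
    // Without the cache, a subscribe-only consumer would wait forever here;
    // with it, wait_end returns immediately.
    println!("{:?}", handle.wait_end().await);
}

The diff below applies the same two-sided ordering: the waiter threads write
handle.ended before calling end_tx.send, and the connect() handler calls
subscribe_end() before cached_end().
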
2026-05-09 18:11:15 +06:00
parent 522e1c5e90
commit aca43d51eb
4 changed files with 101 additions and 71 deletions


@@ -37,6 +37,7 @@ pub struct ProcessHandle {
data_tx: broadcast::Sender<DataEvent>,
end_tx: broadcast::Sender<EndEvent>,
ended: Mutex<Option<EndEvent>>,
stdin: Mutex<Option<std::process::ChildStdin>>,
pty_master: Mutex<Option<std::fs::File>>,
@@ -51,6 +52,10 @@ impl ProcessHandle {
self.end_tx.subscribe()
}
pub fn cached_end(&self) -> Option<EndEvent> {
self.ended.lock().unwrap().clone()
}
pub fn send_signal(&self, sig: Signal) -> Result<(), ConnectError> {
signal::kill(Pid::from_raw(self.pid as i32), sig).map_err(|e| {
ConnectError::new(ErrorCode::Internal, format!("error sending signal: {e}"))
@@ -250,6 +255,7 @@ pub fn spawn_process(
pid,
data_tx: data_tx.clone(),
end_tx: end_tx.clone(),
ended: Mutex::new(None),
stdin: Mutex::new(None),
pty_master: Mutex::new(Some(master_file)),
});
@@ -273,26 +279,25 @@ pub fn spawn_process(
});
let end_tx_clone = end_tx.clone();
let handle_for_waiter = Arc::clone(&handle);
std::thread::spawn(move || {
let mut child = child;
match child.wait() {
Ok(s) => {
let _ = end_tx_clone.send(EndEvent {
let end_event = match child.wait() {
Ok(s) => EndEvent {
exit_code: s.code().unwrap_or(-1),
exited: s.code().is_some(),
status: format!("{s}"),
error: None,
});
}
Err(e) => {
let _ = end_tx_clone.send(EndEvent {
},
Err(e) => EndEvent {
exit_code: -1,
exited: false,
status: "error".into(),
error: Some(e.to_string()),
});
}
}
},
};
*handle_for_waiter.ended.lock().unwrap() = Some(end_event.clone());
let _ = end_tx_clone.send(end_event);
});
tracing::info!(pid, cmd = cmd_str, "process started (pty)");
@@ -336,6 +341,7 @@ pub fn spawn_process(
pid,
data_tx: data_tx.clone(),
end_tx: end_tx.clone(),
ended: Mutex::new(None),
stdin: Mutex::new(stdin),
pty_master: Mutex::new(None),
});
@@ -376,25 +382,24 @@ pub fn spawn_process(
}
let end_tx_clone = end_tx.clone();
let handle_for_waiter = Arc::clone(&handle);
std::thread::spawn(move || {
match child.wait() {
Ok(s) => {
let _ = end_tx_clone.send(EndEvent {
let end_event = match child.wait() {
Ok(s) => EndEvent {
exit_code: s.code().unwrap_or(-1),
exited: s.code().is_some(),
status: format!("{s}"),
error: None,
});
}
Err(e) => {
let _ = end_tx_clone.send(EndEvent {
},
Err(e) => EndEvent {
exit_code: -1,
exited: false,
status: "error".into(),
error: Some(e.to_string()),
});
}
}
},
};
*handle_for_waiter.ended.lock().unwrap() = Some(end_event.clone());
let _ = end_tx_clone.send(end_event);
});
tracing::info!(pid, cmd = cmd_str, "process started (pipe)");


@@ -237,6 +237,7 @@ impl Process for ProcessServiceImpl {
let mut data_rx = handle.subscribe_data();
let mut end_rx = handle.subscribe_end();
let cached_end = handle.cached_end();
let stream = async_stream::stream! {
yield Ok(ConnectResponse {
@@ -249,6 +250,12 @@ impl Process for ProcessServiceImpl {
..Default::default()
});
if let Some(end) = cached_end {
yield Ok(ConnectResponse {
event: buffa::MessageField::some(make_end_event(end)),
..Default::default()
});
} else {
loop {
tokio::select! {
biased;
@@ -281,6 +288,7 @@ impl Process for ProcessServiceImpl {
}
}
}
}
};
Ok((Box::pin(stream), ctx))


@@ -350,9 +350,23 @@ func runPtyLoop(
defer wg.Done()
defer cancel()
for msg := range inputCh {
// Use a background context for unary RPCs so they complete
// even if the stream context is being cancelled.
// pending holds a non-input message dequeued during coalescing
// that must be processed on the next iteration.
var pending *wsPtyIn
for {
var msg wsPtyIn
if pending != nil {
msg = *pending
pending = nil
} else {
var ok bool
msg, ok = <-inputCh
if !ok {
break
}
}
rpcCtx, rpcCancel := context.WithTimeout(context.Background(), 5*time.Second)
switch msg.Type {
@@ -364,7 +378,7 @@ func runPtyLoop(
}
// Coalesce: drain any queued input messages into a single RPC.
data = coalescePtyInput(inputCh, data)
data, pending = coalescePtyInput(inputCh, data)
if _, err := agent.PtySendInput(rpcCtx, connect.NewRequest(&pb.PtySendInputRequest{
SandboxId: sandboxID,
@@ -430,19 +444,17 @@ func runPtyLoop(
// coalescePtyInput drains any immediately-available "input" messages from the
// channel and appends their decoded data to buf, reducing RPC call volume
// during bursts of fast typing.
func coalescePtyInput(ch <-chan wsPtyIn, buf []byte) []byte {
// during bursts of fast typing. Returns the coalesced buffer and any
// non-input message that was dequeued (must be processed by the caller).
func coalescePtyInput(ch <-chan wsPtyIn, buf []byte) ([]byte, *wsPtyIn) {
for {
select {
case msg, ok := <-ch:
if !ok {
return buf
return buf, nil
}
if msg.Type != "input" {
// Non-input message — can't coalesce. Put-back isn't possible
// with channels, but resize/kill during a typing burst is rare
// enough that dropping one is acceptable.
return buf
return buf, &msg
}
data, err := base64.StdEncoding.DecodeString(msg.Data)
if err != nil {
@@ -450,7 +462,7 @@ func coalescePtyInput(ch <-chan wsPtyIn, buf []byte) []byte {
}
buf = append(buf, data...)
default:
return buf
return buf, nil
}
}
}


@@ -378,13 +378,18 @@ func (m *Manager) Pause(ctx context.Context, sandboxID string) error {
}
// Mark sandbox as pausing to block new exec/file/PTY operations.
m.mu.Lock()
sb.Status = models.StatusPausing
m.mu.Unlock()
// restoreRunning reverts state if any pre-freeze step fails.
restoreRunning := func() {
_ = m.vm.UpdateBalloon(context.Background(), sandboxID, 0)
sb.connTracker.Reset()
m.mu.Lock()
sb.Status = models.StatusRunning
m.mu.Unlock()
m.startSampler(sb)
}
// Stop the metrics sampler goroutine before tearing down any resources