fix: niceness EPERM for non-root processes, sync guest before flatten

envd: spawn_process wrapped every command in `nice -n {delta}`, where
delta = 0 - current_nice(). current_nice() used `20 - getpriority()`, but
getpriority already returns the nice value, so current_nice() reported 20 and
delta was -20 for a default-nice envd. A negative delta requests a higher
priority, which non-root users cannot do, so the wrapper failed with
"cannot set niceness: permission denied" for any process run as a non-root
user. current_nice() now returns the raw value; the wrapper invokes `nice` only
when delta > 0 (i.e. envd itself runs at a negative nice value). The
oom_score_adj write is kept (raising one's own score is always permitted).

sandbox: FlattenRootfs used a plain ch.pause, which freezes vCPUs but does not
flush the guest VFS page cache, so freshly written files (e.g. pip installs)
had not reached the block device and the flattened rootfs captured empty files.
Switch to quiesceAndPauseCH (envd /snapshot/prepare: sync + drop_caches), as
CreateSnapshot and Pause already do, and reset the connection tracker after
resume on both the success and quiesce-failure paths.
This commit is contained in:
2026-05-20 01:34:15 +06:00
parent 21c837aa02
commit f06d03996a
2 changed files with 31 additions and 7 deletions

View File

@ -169,11 +169,22 @@ pub fn spawn_process(
env.push((k.clone(), v.clone()));
}
// Reset the child's nice value back to 0 only when envd itself runs at a
// negative nice value (nice_delta = 0 - current_nice() is then > 0, which
// means raising the nice number / lowering priority — permitted for non-root
// processes). A non-root process cannot improve its priority, so skip the
// `nice` wrapper otherwise — it would fail with EPERM ("cannot set niceness:
// permission denied") for commands run as a non-root user. Writing 100 to the
// process's own oom_score_adj is always permitted (raising the score).
let nice_delta = 0 - current_nice();
let oom_script = format!(
r#"echo 100 > /proc/$$/oom_score_adj && exec /usr/bin/nice -n {} "${{@}}""#,
nice_delta
);
let oom_script = if nice_delta > 0 {
format!(
r#"echo 100 > /proc/$$/oom_score_adj && exec /usr/bin/nice -n {} "${{@}}""#,
nice_delta
)
} else {
r#"echo 100 > /proc/$$/oom_score_adj && exec "$@""#.to_string()
};
let mut wrapper_args = vec![
"-c".to_string(),
oom_script,
@ -435,6 +446,8 @@ fn current_nice() -> i32 {
if *libc::__errno_location() != 0 {
return 0;
}
20 - prio
// getpriority(PRIO_PROCESS, 0) returns the nice value directly,
// in the range [-20, 19]; the normal default is 0.
prio
}
}

View File

@ -808,13 +808,24 @@ func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, t
}
defer os.RemoveAll(stageDir)
if err := m.vm.Pause(ctx, sandboxID); err != nil {
return 0, fmt.Errorf("vm pause for flatten: %w", err)
// quiesceAndPauseCH drains connections and calls envd /snapshot/prepare
// (sync + drop_caches) before ch.pause. A plain ch.pause only freezes the
// vCPUs — guest VFS page-cache writes (e.g. freshly pip-installed files)
// would not yet have reached the block device, so the flattened rootfs
// would capture empty files. Matches CreateSnapshot and Pause.
if err := m.quiesceAndPauseCH(ctx, sb); err != nil {
// quiesceAndPauseCH force-closes tracked connections before ch.pause.
// On failure, resume and reset so the sandbox doesn't get stuck
// refusing new proxy connections. Mirrors CreateSnapshot.
_ = m.vm.Resume(context.Background(), sandboxID)
sb.connTracker.Reset()
return 0, fmt.Errorf("quiesce for flatten: %w", err)
}
flattenErr := devicemapper.FlattenSnapshot(sb.dmDevice.DevicePath, filepath.Join(stageDir, "rootfs.ext4"))
if rerr := m.vm.Resume(context.Background(), sandboxID); rerr != nil {
slog.Warn("vm resume after flatten", "id", sandboxID, "error", rerr)
}
sb.connTracker.Reset()
if flattenErr != nil {
return 0, fmt.Errorf("flatten: %w", flattenErr)
}