From 5b4fde055c3e070c7effbfe1cf62e241dbdc4218 Mon Sep 17 00:00:00 2001 From: pptx704 Date: Wed, 15 Apr 2026 18:24:54 +0600 Subject: [PATCH 1/2] Fix build recipe execution and flatten reliability - Set HOME in bctx.EnvVars when USER switches so ~ expands correctly in subsequent RUN/WORKDIR steps instead of resolving to /root - Run /bin/sync inside the guest before FlattenRootfs destroys the VM, preventing pip-installed files from being captured as 0-byte due to unflushed page cache - Wrap healthcheck command with su so it runs with the template's default user context (correct HOME, correct UID) - Export Shellescape from the recipe package for use in build service - Add code-runner-beta recipe (Jupyter server with ipykernel --sys-prefix) and replace old python-interpreter-v0-beta --- internal/recipe/context.go | 7 +++++-- internal/recipe/executor.go | 9 +++++++++ internal/sandbox/manager.go | 12 ++++++++++++ internal/service/build.go | 13 ++++++++++--- recipes/code-runner-beta.healthcheck | 1 + recipes/code-runner-beta.recipefile | 9 +++++++++ recipes/python-interpreter-v0-beta.healthcheck | 1 - recipes/python-interpreter-v0-beta.recipefile | 7 ------- 8 files changed, 46 insertions(+), 13 deletions(-) create mode 100644 recipes/code-runner-beta.healthcheck create mode 100644 recipes/code-runner-beta.recipefile delete mode 100644 recipes/python-interpreter-v0-beta.healthcheck delete mode 100644 recipes/python-interpreter-v0-beta.recipefile diff --git a/internal/recipe/context.go b/internal/recipe/context.go index 820e717..3a64059 100644 --- a/internal/recipe/context.go +++ b/internal/recipe/context.go @@ -115,8 +115,11 @@ func expandEnv(s string, vars map[string]string) string { }) } -// shellescape wraps s in single quotes, escaping any embedded single quotes. +// Shellescape wraps s in single quotes, escaping any embedded single quotes. // This is POSIX-safe for paths, env values, and shell commands. -func shellescape(s string) string { +func Shellescape(s string) string { return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" } + +// shellescape is the package-internal alias for Shellescape. +func shellescape(s string) string { return Shellescape(s) } diff --git a/internal/recipe/executor.go b/internal/recipe/executor.go index 38a8b12..ffecf04 100644 --- a/internal/recipe/executor.go +++ b/internal/recipe/executor.go @@ -193,6 +193,15 @@ func execUser( entry := execRawShell(ctx, st.Raw, sandboxID, phase, step, 30*time.Second, execFn, script) if entry.Ok { bctx.User = username + // Update HOME so ~ expands correctly in subsequent RUN/WORKDIR steps. + if bctx.EnvVars == nil { + bctx.EnvVars = make(map[string]string) + } + if username == "root" { + bctx.EnvVars["HOME"] = "/root" + } else { + bctx.EnvVars["HOME"] = "/home/" + username + } } return entry, entry.Ok } diff --git a/internal/sandbox/manager.go b/internal/sandbox/manager.go index 3dde053..576a3e9 100644 --- a/internal/sandbox/manager.go +++ b/internal/sandbox/manager.go @@ -901,6 +901,18 @@ func (m *Manager) FlattenRootfs(ctx context.Context, sandboxID string, teamID, t return 0, fmt.Errorf("sandbox %s not found", sandboxID) } + // Flush guest page cache to disk before stopping the VM. Without this, + // files written by the build (e.g. pip-installed packages) may exist in the + // guest's page cache but not yet on the dm block device — flatten would then + // capture 0-byte files. + func() { + syncCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + if _, err := sb.client.Exec(syncCtx, "/bin/sync"); err != nil { + slog.Warn("flatten: guest sync failed (non-fatal)", "id", sb.ID, "error", err) + } + }() + // Stop the VM but keep the dm device alive for flattening. m.stopSampler(sb) if err := m.vm.Destroy(ctx, sb.ID); err != nil { diff --git a/internal/service/build.go b/internal/service/build.go index 563c8cd..92d826d 100644 --- a/internal/service/build.go +++ b/internal/service/build.go @@ -444,7 +444,7 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) { return } log.Info("running healthcheck", "cmd", hc.Cmd, "interval", hc.Interval, "timeout", hc.Timeout, "start_period", hc.StartPeriod, "retries", hc.Retries) - if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc); err != nil { + if err := s.waitForHealthcheck(buildCtx, agent, sandboxIDStr, hc, templateDefaultUser); err != nil { s.destroySandbox(buildCtx, agent, sandboxIDStr) if buildCtx.Err() != nil { return @@ -544,7 +544,14 @@ func (s *BuildService) executeBuild(ctx context.Context, buildIDStr string) { // During the start period, failures are not counted toward the retry budget. // Returns nil on the first successful check, or an error if retries are // exhausted, the deadline passes, or the context is cancelled. -func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr string, hc recipe.HealthcheckConfig) error { +func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentClient, sandboxIDStr string, hc recipe.HealthcheckConfig, user string) error { + // Wrap the healthcheck command with su when a non-root user is set, so that + // ~ expands to the correct home directory and the process runs with the + // right UID (matching the template's default user). + cmd := hc.Cmd + if user != "" && user != "root" { + cmd = "su " + recipe.Shellescape(user) + " -s /bin/sh -c " + recipe.Shellescape(hc.Cmd) + } ticker := time.NewTicker(hc.Interval) defer ticker.Stop() @@ -571,7 +578,7 @@ func (s *BuildService) waitForHealthcheck(ctx context.Context, agent buildAgentC resp, err := agent.Exec(execCtx, connect.NewRequest(&pb.ExecRequest{ SandboxId: sandboxIDStr, Cmd: "/bin/sh", - Args: []string{"-c", hc.Cmd}, + Args: []string{"-c", cmd}, TimeoutSec: int32(hc.Timeout.Seconds()), })) cancel() diff --git a/recipes/code-runner-beta.healthcheck b/recipes/code-runner-beta.healthcheck new file mode 100644 index 0000000..186da39 --- /dev/null +++ b/recipes/code-runner-beta.healthcheck @@ -0,0 +1 @@ +--interval=5s --timeout=5s --start-period=60s --retries=5 curl -sf http://127.0.0.1:8888/api/status diff --git a/recipes/code-runner-beta.recipefile b/recipes/code-runner-beta.recipefile new file mode 100644 index 0000000..dc96779 --- /dev/null +++ b/recipes/code-runner-beta.recipefile @@ -0,0 +1,9 @@ +RUN --timeout=5m sudo apt install -y python3 python3-pip python3-venv +ENV PYTHONUNBUFFERED=1 + +RUN python3 -m venv ~/jupyter-env +RUN --timeout=5m ~/jupyter-env/bin/pip install --upgrade pip +RUN --timeout=5m ~/jupyter-env/bin/pip install jupyter-server ipykernel +RUN --timeout=5m ~/jupyter-env/bin/python -m ipykernel install --sys-prefix + +START ~/jupyter-env/bin/jupyter server --ServerApp.ip=0.0.0.0 --ServerApp.port=8888 --ServerApp.token='' --ServerApp.password='' --ServerApp.allow_origin='*' --ServerApp.disable_check_xsrf=True --no-browser --log-level=INFO diff --git a/recipes/python-interpreter-v0-beta.healthcheck b/recipes/python-interpreter-v0-beta.healthcheck deleted file mode 100644 index ca2555c..0000000 --- a/recipes/python-interpreter-v0-beta.healthcheck +++ /dev/null @@ -1 +0,0 @@ ---interval=5s --timeout=3s --start-period=3s --retries=3 python3 -c "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8888/api/status', timeout=3)" diff --git a/recipes/python-interpreter-v0-beta.recipefile b/recipes/python-interpreter-v0-beta.recipefile deleted file mode 100644 index e83f5da..0000000 --- a/recipes/python-interpreter-v0-beta.recipefile +++ /dev/null @@ -1,7 +0,0 @@ -RUN apt-get install -y --no-install-recommends python3 python3-pip python3-venv -RUN python3 -m venv /opt/venv -ENV PATH=/opt/venv/bin:$PATH - -RUN --timeout=5m pip install --no-cache-dir jupyter-server ipykernel - -START jupyter server --ServerApp.ip=0.0.0.0 --ServerApp.port=8888 --ServerApp.token='' --ServerApp.allow_origin='*' --ServerApp.disable_check_xsrf=True --no-browser --allow-root From 5f877afb9e4cc92a952490d115996ab84d46899f Mon Sep 17 00:00:00 2001 From: pptx704 Date: Wed, 15 Apr 2026 18:31:48 +0600 Subject: [PATCH 2/2] Remove PTY inactivity timeout to keep terminal sessions alive indefinitely Sessions now only end on process exit or explicit kill, not idle time. The keepalive ping every 30s remains to prevent network-level disconnects. --- internal/api/handlers_pty.go | 39 ------------------------------------ internal/api/openapi.yaml | 1 - 2 files changed, 40 deletions(-) diff --git a/internal/api/handlers_pty.go b/internal/api/handlers_pty.go index 6a906b1..cd5dcae 100644 --- a/internal/api/handlers_pty.go +++ b/internal/api/handlers_pty.go @@ -23,7 +23,6 @@ import ( ) const ( - ptyInactivityTimeout = 120 * time.Second ptyKeepaliveInterval = 30 * time.Second ptyDefaultCmd = "/bin/bash" ptyDefaultCols = 80 @@ -246,10 +245,6 @@ func runPtyLoop( ) { var wg sync.WaitGroup - // Inactivity timer — reset on input/resize, fires kill after timeout. - timer := time.NewTimer(ptyInactivityTimeout) - defer timer.Stop() - // Output pump: read from Connect stream, write to WebSocket. wg.Add(1) go func() { @@ -317,7 +312,6 @@ func runPtyLoop( })); err != nil { slog.Debug("pty send input error", "error", err) } - resetTimer(timer, ptyInactivityTimeout) case "resize": cols := msg.Cols @@ -331,7 +325,6 @@ func runPtyLoop( })); err != nil { slog.Debug("pty resize error", "error", err) } - resetTimer(timer, ptyInactivityTimeout) } case "kill": @@ -364,26 +357,6 @@ func runPtyLoop( } }() - // Inactivity timeout goroutine. - wg.Add(1) - go func() { - defer wg.Done() - select { - case <-timer.C: - slog.Info("pty session timed out", "sandbox_id", sandboxID, "tag", tag) - rpcCtx, rpcCancel := context.WithTimeout(context.Background(), 5*time.Second) - if _, err := agent.PtyKill(rpcCtx, connect.NewRequest(&pb.PtyKillRequest{ - SandboxId: sandboxID, - Tag: tag, - })); err != nil { - slog.Debug("pty timeout kill error", "error", err) - } - rpcCancel() - cancel() - case <-ctx.Done(): - } - }() - wg.Wait() } @@ -391,15 +364,3 @@ func runPtyLoop( func newPtyTag() string { return "pty-" + id.NewPtyTag() } - -// resetTimer safely resets a timer by stopping it and draining the channel -// before resetting, avoiding the race documented in time.Timer.Reset. -func resetTimer(t *time.Timer, d time.Duration) { - if !t.Stop() { - select { - case <-t.C: - default: - } - } - t.Reset(d) -} diff --git a/internal/api/openapi.yaml b/internal/api/openapi.yaml index 031cefd..984a37d 100644 --- a/internal/api/openapi.yaml +++ b/internal/api/openapi.yaml @@ -1386,7 +1386,6 @@ paths: PTY data (input and output) is base64-encoded because it contains raw terminal bytes (escape sequences, control codes) that are not valid UTF-8. - Sessions have a 120-second inactivity timeout (reset on input/resize). Sessions persist across WebSocket disconnections — the process keeps running in the capsule. Use the `tag` from the "started" response to reconnect later.