From 007717956aec00246b672a0f475defb77a303279 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 19 Jun 2026 19:28:10 +0000 Subject: [PATCH] feat: Add optional `runner.post_task_script` hook after task cleanup (#1026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adds `runner.post_task_script` and `runner.post_task_script_timeout` (default `5m`) to run a host executable after each task’s built-in cleanup (post-steps, container teardown, bind-workdir removal). - Stops task heartbeats via `Reporter.StopHeartbeats()` while the script runs so Gitea won’t assign overlapping work; the final task acknowledgement still happens in `reporter.Close()`. - Script output goes to the runner process log; non-zero exits are warned only and do not change the job result. - Documents lifecycle, offline behavior, timeouts, and Windows limits (`.ps1` not supported yet) in `docs/post-task-script.md`. Reviewed-on: https://gitea.com/gitea/runner/pulls/1026 Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com> --- README.md | 10 ++ act/container/host_environment.go | 41 ++--- act/container/process_other.go | 29 ---- act/container/process_unix.go | 56 ------- act/container/process_windows.go | 71 -------- act/container/util.go | 13 -- act/container/util_openbsd_mips64.go | 7 - act/container/util_plan9.go | 7 - act/container/util_windows.go | 5 - docs/post-task-script.md | 155 +++++++++++++++++ go.sum | 2 - internal/app/run/post_task_script.go | 132 +++++++++++++++ internal/app/run/post_task_script_test.go | 157 ++++++++++++++++++ internal/app/run/runner.go | 3 + internal/pkg/config/config.example.yaml | 15 ++ internal/pkg/config/config.go | 51 +++--- internal/pkg/config/config_test.go | 28 ++++ internal/pkg/process/killer_plan9.go | 29 ++++ internal/pkg/process/killer_unix.go | 56 +++++++ .../pkg/process/killer_unix_test.go | 19 ++- internal/pkg/process/killer_windows.go | 72 ++++++++ .../pkg/process/killer_windows_test.go | 14 +- internal/pkg/process/sysprocattr_plan9.go | 17 ++ internal/pkg/process/sysprocattr_unix.go | 24 +++ internal/pkg/process/sysprocattr_windows.go | 14 ++ internal/pkg/process/treekill.go | 66 ++++++++ internal/pkg/report/reporter.go | 30 +++- internal/pkg/report/reporter_test.go | 62 +++++++ 28 files changed, 922 insertions(+), 263 deletions(-) delete mode 100644 act/container/process_other.go delete mode 100644 act/container/process_unix.go delete mode 100644 act/container/process_windows.go create mode 100644 docs/post-task-script.md create mode 100644 internal/app/run/post_task_script.go create mode 100644 internal/app/run/post_task_script_test.go create mode 100644 internal/pkg/process/killer_plan9.go create mode 100644 internal/pkg/process/killer_unix.go rename act/container/process_unix_test.go => internal/pkg/process/killer_unix_test.go (85%) create mode 100644 internal/pkg/process/killer_windows.go rename act/container/process_windows_test.go => internal/pkg/process/killer_windows_test.go (83%) create mode 100644 internal/pkg/process/sysprocattr_plan9.go create mode 100644 internal/pkg/process/sysprocattr_unix.go create mode 100644 internal/pkg/process/sysprocattr_windows.go create mode 100644 internal/pkg/process/treekill.go diff --git a/README.md b/README.md index cb5b594b..ad03a664 100644 --- a/README.md +++ b/README.md @@ -209,6 +209,16 @@ When `container.bind_workdir` is enabled, stale task workspace directories can b - only purely numeric subdirectories under `container.workdir_parent` are treated as task workspaces and may be removed - cleanup assumes `container.workdir_parent` is not shared across multiple runners +#### Post-task script (`runner.post_task_script`) + +Optional host script that runs **after** each task's built-in cleanup (post-steps, container teardown, bind-workdir removal). Use it for extra machine housekeeping — Docker pruning, disk cleanup, and similar. + +**While the script runs, the runner stops task heartbeats and stays offline from Gitea's perspective until the script exits (or hits `runner.post_task_script_timeout`, default `5m`).** A script that blocks without exiting keeps the runner from taking new work for up to that timeout. Script output goes to the runner log, not the job log; a non-zero exit is warned but does not change the job result. + +On Windows, use `.exe`, `.bat`, or `.cmd` paths; **PowerShell (`.ps1`) is not supported yet** as the configured path — wrap commands in a `.cmd` file instead. + +See **[docs/post-task-script.md](docs/post-task-script.md)** for lifecycle details, environment variables, timeout interaction, and platform notes. + ### Example Deployments Check out the [examples](examples) directory for sample deployment types. diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 5c69eb13..bf52e09e 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -23,6 +23,7 @@ import ( "gitea.com/gitea/runner/act/common" "gitea.com/gitea/runner/act/filecollector" "gitea.com/gitea/runner/act/lookpath" + "gitea.com/gitea/runner/internal/pkg/process" "github.com/go-git/go-billy/v5/helper/polyfill" "github.com/go-git/go-billy/v5/osfs" @@ -261,7 +262,7 @@ func setupPty(cmd *exec.Cmd, cmdline string) (*os.File, *os.File, error) { cmd.Stdin = tty cmd.Stdout = tty cmd.Stderr = tty - cmd.SysProcAttr = getSysProcAttr(cmdline, true) + cmd.SysProcAttr = process.SysProcAttr(cmdline, true) return ppty, tty, nil } @@ -321,30 +322,14 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st cmd.Env = envList cmd.Stderr = e.StdOut cmd.Dir = wd - cmd.SysProcAttr = getSysProcAttr(cmdline, false) + cmd.SysProcAttr = process.SysProcAttr(cmdline, false) - // A step often launches a process tree (a shell that starts a child which - // spawns further background or GUI processes). The default context - // cancellation only kills the direct child, leaving the rest of the tree - // running; and because the orphans inherit cmd's stdout/stderr pipe, - // cmd.Wait() would block forever, hanging the runner. Kill the whole tree on - // cancellation — via a Job Object on Windows and the process group on Unix - // (see processKiller) — and bound the wait so a leftover pipe writer can - // never hang Wait indefinitely. - var killer atomic.Pointer[processKiller] - cmd.Cancel = func() error { - if k := killer.Load(); k != nil { - return k.Kill() - } - if cmd.Process != nil { - return cmd.Process.Kill() - } - return nil - } - // Once the step process has exited, give its I/O pipes at most this long to - // drain before Wait force-closes them and returns (Go's WaitDelay). This - // also covers a step that backgrounds a process holding the pipe open. - cmd.WaitDelay = 10 * time.Second + // Kill the step's whole process tree on cancellation (a step often launches a + // shell that spawns further background or GUI children) and bound the post-exit + // I/O wait, so an orphan inheriting cmd's stdout/stderr pipe can never hang + // cmd.Wait() and the runner. See process.TreeKill. The PTY path below may + // override SysProcAttr, but never touches Cancel/WaitDelay. + treeKill := process.NewTreeKill(cmd) var ppty *os.File var tty *os.File @@ -375,15 +360,9 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st if err := cmd.Start(); err != nil { return err } - // Capture the started process for tree-kill on cancellation: a Job Object on - // Windows (children spawned afterwards are auto-included) and the process - // group on Unix. On failure (e.g. Windows nested-job restrictions) we fall - // back to the default single-process kill; WaitDelay + end-of-job cleanup - // still apply. - if k, kerr := newProcessKiller(cmd.Process); kerr != nil { + if k, kerr := treeKill.Capture(cmd.Process); kerr != nil { common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr) } else { - killer.Store(k) defer k.Close() } err = cmd.Wait() diff --git a/act/container/process_other.go b/act/container/process_other.go deleted file mode 100644 index ffb1d14a..00000000 --- a/act/container/process_other.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -//go:build plan9 - -package container - -import "os" - -// processKiller falls back to single-process termination on platforms without -// a process-group / Job Object tree-kill. The Job Object (Windows) and process -// group (Unix) based tree-kills live in process_windows.go / process_unix.go; -// here we just kill the direct child, matching the previous default behaviour. -type processKiller struct { - p *os.Process -} - -func newProcessKiller(p *os.Process) (*processKiller, error) { - return &processKiller{p: p}, nil -} - -func (k *processKiller) Kill() error { - if k == nil || k.p == nil { - return nil - } - return k.p.Kill() -} - -func (k *processKiller) Close() error { return nil } diff --git a/act/container/process_unix.go b/act/container/process_unix.go deleted file mode 100644 index 49b43525..00000000 --- a/act/container/process_unix.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -//go:build !windows && !plan9 - -package container - -import ( - "errors" - "os" - "syscall" -) - -// processKiller terminates a step process together with its whole process -// group, which is the Unix counterpart of the Windows Job Object tree-kill. -// -// Background: a step often launches a process tree (a shell that starts a child -// which in turn spawns further background processes). The default -// exec.CommandContext cancellation only kills the direct child, so cancelling a -// job left the rest of the tree running. Because those orphans inherited the -// step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner -// hung. -// -// Steps are started with Setpgid (or Setsid for the PTY path, see -// getSysProcAttr), which makes the step process the leader of a new process -// group whose ID equals its PID. Signalling the negative PID delivers to every -// process still in that group, so we can tear down the whole tree atomically on -// cancellation, which also closes the inherited pipe handles so cmd.Wait() can -// return. -type processKiller struct { - pgid int -} - -// newProcessKiller captures the process group of p (an already-started -// process). Because the step is launched with Setpgid/Setsid, p is a group -// leader and its PGID equals its PID; children spawned afterwards stay in the -// same group unless they explicitly create their own. -func newProcessKiller(p *os.Process) (*processKiller, error) { - return &processKiller{pgid: p.Pid}, nil -} - -// Kill sends SIGKILL to the entire process group (the step process and every -// descendant that stayed in the group). A missing group (ESRCH) means the -// processes already exited and is not treated as an error. -func (k *processKiller) Kill() error { - if k == nil || k.pgid <= 0 { - return nil - } - if err := syscall.Kill(-k.pgid, syscall.SIGKILL); err != nil && !errors.Is(err, syscall.ESRCH) { - return err - } - return nil -} - -// Close is a no-op on Unix; there is no job handle to release. -func (k *processKiller) Close() error { return nil } diff --git a/act/container/process_windows.go b/act/container/process_windows.go deleted file mode 100644 index 7ad61dca..00000000 --- a/act/container/process_windows.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2026 The Gitea Authors. All rights reserved. -// SPDX-License-Identifier: MIT - -package container - -import ( - "os" - - "golang.org/x/sys/windows" -) - -// processKiller terminates a step process together with its entire descendant -// tree via a Windows Job Object. -// -// Background: a step often launches a process tree (a shell that starts a -// child which in turn spawns further GUI or background processes). The default -// exec.CommandContext cancellation only kills the direct child, so cancelling a -// job left the rest of the tree running. Because those orphans inherited the -// step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner hung. -// -// Assigning the step process to a Job Object lets us kill the whole tree -// atomically on cancellation (TerminateJobObject), which also closes the -// inherited pipe handles so cmd.Wait() can return. -type processKiller struct { - job windows.Handle -} - -// newProcessKiller creates a Job Object and assigns p (an already-started -// process) to it. Children spawned by p afterwards are automatically part of -// the job. The job does NOT use JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, so closing -// the handle on normal completion does not kill legitimate background -// processes; the tree is only torn down by an explicit Kill (cancellation). -func newProcessKiller(p *os.Process) (*processKiller, error) { - job, err := windows.CreateJobObject(nil, nil) - if err != nil { - return nil, err - } - - h, err := windows.OpenProcess(windows.PROCESS_SET_QUOTA|windows.PROCESS_TERMINATE, false, uint32(p.Pid)) - if err != nil { - windows.CloseHandle(job) - return nil, err - } - defer windows.CloseHandle(h) - - if err := windows.AssignProcessToJobObject(job, h); err != nil { - windows.CloseHandle(job) - return nil, err - } - - return &processKiller{job: job}, nil -} - -// Kill terminates every process currently assigned to the job (the step process -// and all of its descendants). -func (k *processKiller) Kill() error { - if k == nil || k.job == 0 { - return nil - } - return windows.TerminateJobObject(k.job, 1) -} - -// Close releases the job handle. It does not terminate the processes. -func (k *processKiller) Close() error { - if k == nil || k.job == 0 { - return nil - } - h := k.job - k.job = 0 - return windows.CloseHandle(h) -} diff --git a/act/container/util.go b/act/container/util.go index 3409cc1d..1b606511 100644 --- a/act/container/util.go +++ b/act/container/util.go @@ -8,23 +8,10 @@ package container import ( "os" - "syscall" "github.com/creack/pty" ) -func getSysProcAttr(_ string, tty bool) *syscall.SysProcAttr { - if tty { - return &syscall.SysProcAttr{ - Setsid: true, - Setctty: true, - } - } - return &syscall.SysProcAttr{ - Setpgid: true, - } -} - func openPty() (*os.File, *os.File, error) { return pty.Open() } diff --git a/act/container/util_openbsd_mips64.go b/act/container/util_openbsd_mips64.go index 40c83181..859dd027 100644 --- a/act/container/util_openbsd_mips64.go +++ b/act/container/util_openbsd_mips64.go @@ -7,15 +7,8 @@ package container import ( "errors" "os" - "syscall" ) -func getSysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr { - return &syscall.SysProcAttr{ - Setpgid: true, - } -} - func openPty() (*os.File, *os.File, error) { return nil, nil, errors.New("Unsupported") } diff --git a/act/container/util_plan9.go b/act/container/util_plan9.go index bb79c30d..859dd027 100644 --- a/act/container/util_plan9.go +++ b/act/container/util_plan9.go @@ -7,15 +7,8 @@ package container import ( "errors" "os" - "syscall" ) -func getSysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr { - return &syscall.SysProcAttr{ - Rfork: syscall.RFNOTEG, - } -} - func openPty() (*os.File, *os.File, error) { return nil, nil, errors.New("Unsupported") } diff --git a/act/container/util_windows.go b/act/container/util_windows.go index f3d6e8bb..859dd027 100644 --- a/act/container/util_windows.go +++ b/act/container/util_windows.go @@ -7,13 +7,8 @@ package container import ( "errors" "os" - "syscall" ) -func getSysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr { - return &syscall.SysProcAttr{CmdLine: cmdLine, CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP} -} - func openPty() (*os.File, *os.File, error) { return nil, nil, errors.New("Unsupported") } diff --git a/docs/post-task-script.md b/docs/post-task-script.md new file mode 100644 index 00000000..aa279936 --- /dev/null +++ b/docs/post-task-script.md @@ -0,0 +1,155 @@ +# Post-task script + +The post-task script is an optional host hook that runs **once after every task**, after the runner has already finished its normal per-task cleanup. Typical uses include pruning Docker images, vacuuming ephemeral disks, or resetting VM state between jobs. + +It is configured under `runner.post_task_script` in the runner YAML config (see [config.example.yaml](../internal/pkg/config/config.example.yaml)). + +## When it runs + +For each task, execution order is: + +1. Workflow runs (steps, actions, containers). +2. In-job cleanup (action `post:` steps, container stop/remove). +3. Job outputs are reported to Gitea. +4. Bind-workdir workspace removal, when `container.bind_workdir` is enabled. +5. **Post-task script** (this hook). +6. Final task acknowledgement to Gitea (`reporter.Close()`). + +The script is **additive**: it does not replace any built-in cleanup. When `container.bind_workdir` is enabled, the task workspace directory has usually already been deleted before the script starts. `GITEA_WORKSPACE` is still set to the path the job used, for reference. + +## Runner stays offline until the script finishes + +This is the most important operational detail. + +When the post-task script starts, the runner **stops sending task heartbeats** to Gitea (the same mechanism used during cancel/cleanup). From Gitea's perspective, the runner is **not available for new work** until: + +1. The script exits (success or failure), **and** +2. The runner sends the final task flush to Gitea. + +While the script runs: + +- **Gitea will not assign another task** to this runner for the current job slot (heartbeats are stopped). +- **The runner capacity slot stays occupied** locally — with `capacity: 1`, the poller will not start another task until the script completes. +- **Runner shutdown** (`shutdown_timeout`) counts this phase as part of the in-flight task; a long or stuck script delays graceful shutdown. + +If the script **never exits**, the runner remains in this state until `runner.post_task_script_timeout` elapses (default **5 minutes** when a script is configured). The runner then kills the script process and proceeds to the final acknowledgement. Until that timeout fires, **the runner effectively stays offline**. + +Set `post_task_script_timeout` to a value that matches how long your housekeeping is allowed to take — not how long you wish it could take. Prefer short, bounded scripts. + +### Recommendations + +- Keep scripts **fast and bounded** (seconds, not minutes). +- Avoid interactive prompts, blocking network calls without timeouts, or waiting on user input. +- Use **idempotent** operations (the script may run after success, failure, or cancellation). +- Test failure modes: hung script, non-zero exit, missing executable. +- Watch the **runner process log** for script output (it is not written to the Gitea job log). +- On shutdown, ensure scripts respond to process termination within `post_task_script_timeout`. + +## Configuration + +```yaml +runner: + # Path to an executable on the host. Empty or omitted disables the hook. + post_task_script: /usr/local/bin/gitea-post-task.sh + + # Hard limit on script runtime. Default when post_task_script is set: 5m. + # If the script exceeds this, it is killed and the runner continues. + post_task_script_timeout: 2m +``` + +| Option | Default | Description | +| --- | --- | --- | +| `runner.post_task_script` | *(disabled)* | Host path to the script or binary. Relative paths are resolved from the runner process working directory. | +| `runner.post_task_script_timeout` | `5m` (only when script is set) | Maximum time the script may run before the runner kills it and moves on. | + +The script must be **executable** on the host (shebang on Linux/macOS, or a native `.exe` / `.bat` / `.cmd` on Windows). **PowerShell (`.ps1`) is not supported yet** as the value of `post_task_script`; the runner executes the configured path directly and does not invoke `powershell.exe` for you. + +`gitea-runner exec` does **not** load runner YAML and will not run this hook. + +## Environment variables + +The script receives `runner.envs` / `runner.env_file` values plus: + +| Variable | Description | +| --- | --- | +| `GITEA_TASK_ID` | Numeric task ID. | +| `GITEA_RUN_ID` | Workflow run ID, when provided by the server. | +| `GITEA_REPOSITORY` | Repository slug (`owner/name`). | +| `GITEA_WORKSPACE` | Workspace path used for the job (may already be deleted). | +| `GITEA_JOB_RESULT` | `success`, `failure`, `cancelled`, `skipped`, or `unknown`. | + +The script environment is **not** a full copy of the job container environment. System variables such as `PATH` are only present if you define them in `runner.envs` or `runner.env_file`. + +## Output and errors + +- **Stdout/stderr** are written to the **runner process log** (logrus), prefixed with `post-task script stdout:` / `post-task script stderr:`. +- **Non-zero exit codes** are logged as warnings only. They do **not** change the job result already reported to Gitea. +- **Timeouts and start failures** are logged as warnings; the runner still completes the task acknowledgement. + +## Interaction with other timeouts + +| Timeout | Effect on post-task script | +| --- | --- | +| `runner.post_task_script_timeout` | Kills the script if it runs too long. This is the **only** timeout that bounds the script. | +| `runner.timeout` | Caps the task **up to** the script. The script detaches from the task deadline, so a job near the runner timeout limit does **not** cut the script short — it still gets its full `post_task_script_timeout`. | +| `runner.shutdown_timeout` | On SIGINT/SIGTERM, bounds how long the runner waits for the **task** to finish. The post-task script detaches from cancellation, so it is **not** interrupted by this window and may extend shutdown until its own `post_task_script_timeout` elapses. | + +## Examples + +### Linux — prune dangling Docker resources + +`/usr/local/bin/gitea-post-task.sh`: + +```sh +#!/bin/sh +set -eu +docker image prune -f +docker builder prune -f --filter 'until=24h' +``` + +`config.yaml`: + +```yaml +runner: + post_task_script: /usr/local/bin/gitea-post-task.sh + post_task_script_timeout: 3m +``` + +### Windows — batch file (`.cmd`) + +Use a `.cmd` or `.bat` file. PowerShell scripts are **not supported yet** as `post_task_script`; call PowerShell from a batch wrapper if needed: + +`C:\gitea-runner\scripts\post-task.cmd`: + +```bat +@echo off +docker image prune -f +``` + +```yaml +runner: + post_task_script: C:\gitea-runner\scripts\post-task.cmd + post_task_script_timeout: 3m +``` + +PowerShell workaround until native `.ps1` support exists: + +`C:\gitea-runner\scripts\post-task.cmd`: + +```bat +@echo off +powershell.exe -NoProfile -NonInteractive -ExecutionPolicy Bypass -File "%~dp0post-task.ps1" +``` + +## Windows notes + +- Supported as `post_task_script`: `.exe`, `.bat`, `.cmd`. +- **Not supported yet:** `.ps1` as the configured path (use a `.cmd` wrapper; see above). +- `.sh` files require a Unix shell on the PATH unless you point `post_task_script` at the interpreter. +- Use backslashes or forward slashes in YAML paths; both work in Go on Windows. + +## See also + +- [Configuration](../README.md#configuration) — generating and loading `config.yaml` +- [config.example.yaml](../internal/pkg/config/config.example.yaml) — all runner options +- Bind-workdir idle cleanup (`runner.workdir_cleanup_age`) — separate from this hook; runs only when the runner is idle diff --git a/go.sum b/go.sum index 0acf3477..51f3fc47 100644 --- a/go.sum +++ b/go.sum @@ -253,8 +253,6 @@ golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= -golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc= golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= diff --git a/internal/app/run/post_task_script.go b/internal/app/run/post_task_script.go new file mode 100644 index 00000000..d0688831 --- /dev/null +++ b/internal/app/run/post_task_script.go @@ -0,0 +1,132 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package run + +import ( + "context" + "errors" + "fmt" + "io" + "os/exec" + "strconv" + "strings" + "time" + + "gitea.com/gitea/runner/act/common" + "gitea.com/gitea/runner/internal/pkg/config" + "gitea.com/gitea/runner/internal/pkg/metrics" + "gitea.com/gitea/runner/internal/pkg/process" + "gitea.com/gitea/runner/internal/pkg/report" + + runnerv1 "gitea.dev/actions-proto-go/runner/v1" + log "github.com/sirupsen/logrus" +) + +func (r *Runner) runPostTaskScript(ctx context.Context, reporter *report.Reporter, task *runnerv1.Task, workdir string) { + script := r.cfg.Runner.PostTaskScript + if script == "" { + return + } + + timeout := r.cfg.Runner.PostTaskScriptTimeout + if timeout <= 0 { + timeout = config.DefaultPostTaskScriptTimeout + } + + scriptCtx, cancel := postTaskScriptContext(ctx, timeout) + defer cancel() + + env := r.postTaskScriptEnv(reporter, task, workdir) + log.Infof("running post-task script %q for task %d", script, task.Id) + + cmd := exec.CommandContext(scriptCtx, script) + cmd.Env = envListFromMap(env) + cmd.SysProcAttr = process.SysProcAttr(script, false) + + stdout := postTaskScriptLogWriter("stdout") + stderr := postTaskScriptLogWriter("stderr") + cmd.Stdout = stdout + cmd.Stderr = stderr + + // Kill the script's whole process tree on cancellation and bound the post-exit + // I/O wait, so a backgrounded child inheriting cmd's stdout/stderr pipe can + // never hang cmd.Wait() and the runner. See process.TreeKill. + treeKill := process.NewTreeKill(cmd) + + if err := cmd.Start(); err != nil { + log.Warnf("post-task script %q for task %d: %v", script, task.Id, err) + return + } + if k, kerr := treeKill.Capture(cmd.Process); kerr != nil { + log.Warnf("post-task script %q for task %d: process tree kill setup failed, falling back to single-process kill: %v", script, task.Id, kerr) + } else { + defer k.Close() + } + + err := cmd.Wait() + // Flush any trailing, not-yet-newline-terminated output now that the I/O + // copiers have finished (cmd.Wait, bounded by WaitDelay above, guarantees it). + common.FlushWriter(stdout) + common.FlushWriter(stderr) + if err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + log.Warnf("post-task script %q for task %d: %v", script, task.Id, err) + return + } + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + log.Warnf("post-task script %q for task %d exited with code %d", script, task.Id, exitErr.ExitCode()) + return + } + log.Warnf("post-task script %q for task %d: %v", script, task.Id, err) + } +} + +func postTaskScriptContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) { + // Detach from the task context's deadline and cancellation: the task has + // already finished by the time the post-task script runs, so the script must + // get its full configured timeout. Inheriting the task deadline would silently + // truncate that budget when the job completed close to its own timeout (and an + // already-cancelled task context would skip the script entirely). + // context.WithoutCancel keeps the context values while dropping the deadline. + return context.WithTimeout(context.WithoutCancel(ctx), timeout) +} + +func (r *Runner) postTaskScriptEnv(reporter *report.Reporter, task *runnerv1.Task, workdir string) map[string]string { + env := r.cloneEnvs() + env["GITEA_TASK_ID"] = strconv.FormatInt(task.Id, 10) + env["GITEA_WORKSPACE"] = workdir + // GITEA_JOB_RESULT shares the runner's canonical result vocabulary + // (success/failure/cancelled/skipped/unknown), the same strings the reporter + // parses and the metrics labels use. + env["GITEA_JOB_RESULT"] = metrics.ResultToStatusLabel(reporter.Result()) + if v := task.Context.Fields["run_id"].GetStringValue(); v != "" { + env["GITEA_RUN_ID"] = v + } + if v := task.Context.Fields["repository"].GetStringValue(); v != "" { + env["GITEA_REPOSITORY"] = v + } + return env +} + +func envListFromMap(env map[string]string) []string { + envList := make([]string, 0, len(env)) + for k, v := range env { + envList = append(envList, fmt.Sprintf("%s=%s", k, v)) + } + return envList +} + +// postTaskScriptLogWriter returns an io.Writer that logs the script's output one +// line at a time, tagged with the stream name. It is passed as cmd.Stdout/Stderr +// (rather than a StdoutPipe) so that cmd.WaitDelay governs the copying goroutine: +// a backgrounded process holding the pipe open can never block cmd.Wait() +// indefinitely. Flush any trailing partial line with common.FlushWriter after +// cmd.Wait() returns. +func postTaskScriptLogWriter(stream string) io.Writer { + return common.NewLineWriter(func(line string) bool { + log.Infof("post-task script %s: %s", stream, strings.TrimRight(line, "\r\n")) + return true + }) +} diff --git a/internal/app/run/post_task_script_test.go b/internal/app/run/post_task_script_test.go new file mode 100644 index 00000000..ae7e2eeb --- /dev/null +++ b/internal/app/run/post_task_script_test.go @@ -0,0 +1,157 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package run + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "gitea.com/gitea/runner/internal/pkg/config" + "gitea.com/gitea/runner/internal/pkg/metrics" + "gitea.com/gitea/runner/internal/pkg/report" + + runnerv1 "gitea.dev/actions-proto-go/runner/v1" + log "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/types/known/structpb" +) + +func TestRunPostTaskScriptSkippedWhenEmpty(t *testing.T) { + r := &Runner{ + cfg: &config.Config{}, + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + taskCtx, err := structpb.NewStruct(map[string]any{}) + require.NoError(t, err) + task := &runnerv1.Task{Id: 1, Context: taskCtx} + reporter := report.NewReporter(ctx, cancel, nil, task, r.cfg) + + require.NotPanics(t, func() { + r.runPostTaskScript(ctx, reporter, task, "/workspace/owner/repo") + }) +} + +func TestRunPostTaskScriptNonZeroExitDoesNotPanic(t *testing.T) { + dir := t.TempDir() + scriptPath := filepath.Join(dir, "fail.sh") + require.NoError(t, os.WriteFile(scriptPath, []byte("#!/bin/sh\nexit 2\n"), 0o700)) + + cfg, err := config.LoadDefault("") + require.NoError(t, err) + cfg.Runner.PostTaskScript = scriptPath + + r := &Runner{cfg: cfg} + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + taskCtx, err := structpb.NewStruct(map[string]any{}) + require.NoError(t, err) + task := &runnerv1.Task{Id: 1, Context: taskCtx} + reporter := report.NewReporter(ctx, cancel, nil, task, cfg) + + require.NotPanics(t, func() { + r.runPostTaskScript(ctx, reporter, task, "/workspace/owner/repo") + }) +} + +func TestPostTaskScriptContextUsesFullTimeout(t *testing.T) { + const timeout = 5 * time.Minute + + // A task context that finished close to its own deadline must not truncate the + // script's budget: the script should still get its full configured timeout. + near, cancelNear := context.WithTimeout(context.Background(), time.Second) + defer cancelNear() + scriptCtx, cancel := postTaskScriptContext(near, timeout) + defer cancel() + deadline, ok := scriptCtx.Deadline() + require.True(t, ok) + assert.Greater(t, time.Until(deadline), time.Minute, "script timeout truncated to task deadline") + + // An already-cancelled task context must not cancel the script either. + cancelledCtx, cancelIt := context.WithCancel(context.Background()) + cancelIt() + scriptCtx2, cancel2 := postTaskScriptContext(cancelledCtx, timeout) + defer cancel2() + assert.NoError(t, scriptCtx2.Err(), "script context inherited the cancelled task context") +} + +func TestPostTaskScriptEnv(t *testing.T) { + cfg, err := config.LoadDefault("") + require.NoError(t, err) + + r := &Runner{ + cfg: cfg, + envs: map[string]string{"BASE": "1"}, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + taskCtx, err := structpb.NewStruct(map[string]any{ + "run_id": "99", + "repository": "acme/widget", + }) + require.NoError(t, err) + task := &runnerv1.Task{Id: 3, Context: taskCtx} + reporter := report.NewReporter(ctx, cancel, nil, task, cfg) + setReporterJobResult(t, reporter, runnerv1.Result_RESULT_FAILURE) + + env := r.postTaskScriptEnv(reporter, task, "/tmp/workspace") + assert.Equal(t, "1", env["BASE"]) + assert.Equal(t, "3", env["GITEA_TASK_ID"]) + assert.Equal(t, "99", env["GITEA_RUN_ID"]) + assert.Equal(t, "acme/widget", env["GITEA_REPOSITORY"]) + assert.Equal(t, "/tmp/workspace", env["GITEA_WORKSPACE"]) + assert.Equal(t, "failure", env["GITEA_JOB_RESULT"]) +} + +func TestRunPostTaskScriptIntegration(t *testing.T) { + dir := t.TempDir() + outFile := filepath.Join(dir, "out.txt") + scriptPath := filepath.Join(dir, "post-task.sh") + script := "#!/bin/sh\nprintf '%s %s %s' \"$GITEA_TASK_ID\" \"$GITEA_JOB_RESULT\" \"$CUSTOM\" > \"" + outFile + "\"\n" + require.NoError(t, os.WriteFile(scriptPath, []byte(script), 0o700)) + + cfg, err := config.LoadDefault("") + require.NoError(t, err) + cfg.Runner.PostTaskScript = scriptPath + + r := &Runner{ + cfg: cfg, + envs: map[string]string{"CUSTOM": "runner-env"}, + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + taskCtx, err := structpb.NewStruct(map[string]any{}) + require.NoError(t, err) + task := &runnerv1.Task{Id: 11, Context: taskCtx} + reporter := report.NewReporter(ctx, cancel, nil, task, cfg) + setReporterJobResult(t, reporter, runnerv1.Result_RESULT_SUCCESS) + + r.runPostTaskScript(ctx, reporter, task, "/workspace/acme/repo") + + content, err := os.ReadFile(outFile) + require.NoError(t, err) + assert.Equal(t, "11 success runner-env", string(content)) +} + +func setReporterJobResult(t *testing.T, reporter *report.Reporter, result runnerv1.Result) { + t.Helper() + require.NoError(t, reporter.Fire(&log.Entry{ + Time: time.Now(), + Message: "job finished", + Data: log.Fields{ + "stage": "Post", + "jobResult": metrics.ResultToStatusLabel(result), + }, + })) +} diff --git a/internal/app/run/runner.go b/internal/app/run/runner.go index cb48cea9..16e5788b 100644 --- a/internal/app/run/runner.go +++ b/internal/app/run/runner.go @@ -475,6 +475,9 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report. } } + reporter.StopHeartbeats() + r.runPostTaskScript(ctx, reporter, task, workdir) + return execErr } diff --git a/internal/pkg/config/config.example.yaml b/internal/pkg/config/config.example.yaml index fe31138d..31fedef4 100644 --- a/internal/pkg/config/config.example.yaml +++ b/internal/pkg/config/config.example.yaml @@ -83,6 +83,21 @@ runner: # terminal; tools like `docker build` emit redrawing progress frames into the captured log # when a TTY is present. allocate_pty: false + # Optional executable on the host, run once after each task's built-in cleanup + # (post-steps, container teardown, bind-workdir removal). Additive only. + # + # IMPORTANT: While this script runs the runner stops task heartbeats and stays + # offline from Gitea's perspective until the script exits. A script that never + # returns blocks new work until post_task_script_timeout kills it (default 5m). + # Keep scripts short; set post_task_script_timeout to a safe upper bound. + # + # Output -> runner process log (not the job log). Non-zero exit -> warning only. + # Windows: use .exe, .bat, or .cmd. PowerShell (.ps1) is not supported yet as + # the configured path; wrap PowerShell commands in a .cmd file instead. + # Full guide: docs/post-task-script.md + post_task_script: '' + # Hard limit on post_task_script runtime. Default if omitted: 5m. + post_task_script_timeout: 5m cache: # Enable the built-in cache server (used by actions/cache and similar actions). diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index ebd2e952..e9e5da0f 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -16,6 +16,12 @@ import ( "go.yaml.in/yaml/v4" ) +// DefaultPostTaskScriptTimeout is the fallback cap on how long the post-task +// script may run when post_task_script is set without an explicit timeout. It is +// applied both at config load (for a configured script) and at the point of use +// (so a programmatically built config still gets a sane bound). +const DefaultPostTaskScriptTimeout = 5 * time.Minute + // Log represents the configuration for logging. type Log struct { Level string `yaml:"level"` // Level indicates the logging level. @@ -23,26 +29,28 @@ type Log struct { // Runner represents the configuration for the runner. type Runner struct { - File string `yaml:"file"` // File specifies the file path for the runner. - Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner. - Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner. - EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner. - Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout. - ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner. - Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode. - FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources. - FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources. - FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle. - WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup. - IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence. - LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush. - LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent. - LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size. - StateReportInterval time.Duration `yaml:"state_report_interval"` // StateReportInterval specifies the interval for state reporting. - ReportCloseTimeout time.Duration `yaml:"report_close_timeout"` // ReportCloseTimeout caps each RPC attempt when flushing the final logs and task state at job completion, on a detached context so a server cancel can't block the acknowledgement. - Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup - GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github - AllocatePTY bool `yaml:"allocate_pty"` // AllocatePTY allocates a pseudo-TTY for each step's process. Default is false, matching GitHub's actions/runner. Enable only for jobs that need an interactive terminal; tools like docker build emit redrawing progress frames into the captured log when a TTY is present. Applies to both host and docker backends. + File string `yaml:"file"` // File specifies the file path for the runner. + Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner. + Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner. + EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner. + Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout. + ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner. + Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode. + FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources. + FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources. + FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle. + WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup. + IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence. + LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush. + LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent. + LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size. + StateReportInterval time.Duration `yaml:"state_report_interval"` // StateReportInterval specifies the interval for state reporting. + ReportCloseTimeout time.Duration `yaml:"report_close_timeout"` // ReportCloseTimeout caps each RPC attempt when flushing the final logs and task state at job completion, on a detached context so a server cancel can't block the acknowledgement. + Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup + GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github + AllocatePTY bool `yaml:"allocate_pty"` // AllocatePTY allocates a pseudo-TTY for each step's process. Default is false, matching GitHub's actions/runner. Enable only for jobs that need an interactive terminal; tools like docker build emit redrawing progress frames into the captured log when a TTY is present. Applies to both host and docker backends. + PostTaskScript string `yaml:"post_task_script"` // PostTaskScript is the path to an executable script run on the host after each task's cleanup completes. Empty disables the hook. On Windows use .exe/.bat/.cmd; PowerShell (.ps1) is not supported yet as the configured path. + PostTaskScriptTimeout time.Duration `yaml:"post_task_script_timeout"` // PostTaskScriptTimeout caps how long the post-task script may run. Default is 5m when post_task_script is set. } // Cache represents the configuration for caching. @@ -193,6 +201,9 @@ func LoadDefault(file string) (*Config, error) { if cfg.Runner.ReportCloseTimeout <= 0 { cfg.Runner.ReportCloseTimeout = 10 * time.Second } + if cfg.Runner.PostTaskScript != "" && cfg.Runner.PostTaskScriptTimeout <= 0 { + cfg.Runner.PostTaskScriptTimeout = DefaultPostTaskScriptTimeout + } if cfg.Metrics.Addr == "" { cfg.Metrics.Addr = "127.0.0.1:9101" } diff --git a/internal/pkg/config/config_test.go b/internal/pkg/config/config_test.go index 1005a423..382aad48 100644 --- a/internal/pkg/config/config_test.go +++ b/internal/pkg/config/config_test.go @@ -107,6 +107,34 @@ runner: // TestLoadDefault_MalformedYAMLReturnsParseError pins the error surfaced for // invalid YAML to the canonical "parse config file" message rather than the // "for defaults metadata" variant — i.e. the main yaml.Unmarshal runs first. +func TestLoadDefault_LoadsPostTaskScript(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + post_task_script: /usr/local/bin/post-task.sh + post_task_script_timeout: 2m +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, "/usr/local/bin/post-task.sh", cfg.Runner.PostTaskScript) + assert.Equal(t, 2*time.Minute, cfg.Runner.PostTaskScriptTimeout) +} + +func TestLoadDefault_DefaultsPostTaskScriptTimeout(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + post_task_script: /usr/local/bin/post-task.sh +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, 5*time.Minute, cfg.Runner.PostTaskScriptTimeout) +} + func TestLoadDefault_MalformedYAMLReturnsParseError(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "config.yaml") diff --git a/internal/pkg/process/killer_plan9.go b/internal/pkg/process/killer_plan9.go new file mode 100644 index 00000000..1249208e --- /dev/null +++ b/internal/pkg/process/killer_plan9.go @@ -0,0 +1,29 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build plan9 + +package process + +import "os" + +// Killer falls back to single-process termination on platforms without a +// process-group / Job Object tree-kill. The Job Object (Windows) and process +// group (Unix) based tree-kills live in killer_windows.go / killer_unix.go; +// here we just kill the direct child, matching the previous default behaviour. +type Killer struct { + p *os.Process +} + +func NewKiller(p *os.Process) (*Killer, error) { + return &Killer{p: p}, nil +} + +func (k *Killer) Kill() error { + if k == nil || k.p == nil { + return nil + } + return k.p.Kill() +} + +func (k *Killer) Close() error { return nil } diff --git a/internal/pkg/process/killer_unix.go b/internal/pkg/process/killer_unix.go new file mode 100644 index 00000000..ef808415 --- /dev/null +++ b/internal/pkg/process/killer_unix.go @@ -0,0 +1,56 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows && !plan9 + +package process + +import ( + "errors" + "os" + "syscall" +) + +// Killer terminates a started process together with its whole process group, +// which is the Unix counterpart of the Windows Job Object tree-kill. +// +// Background: a process (a step or a post-task script) often launches a process +// tree (a shell that starts a child which in turn spawns further background +// processes). The default exec.CommandContext cancellation only kills the +// direct child, so cancelling left the rest of the tree running. Because those +// orphans inherited the parent's stdout/stderr pipe, cmd.Wait() also blocked +// forever and the runner hung. +// +// Processes are started with Setpgid (or Setsid for the PTY path, see +// SysProcAttr), which makes the process the leader of a new process group whose +// ID equals its PID. Signalling the negative PID delivers to every process +// still in that group, so we can tear down the whole tree atomically on +// cancellation, which also closes the inherited pipe handles so cmd.Wait() can +// return. +type Killer struct { + pgid int +} + +// NewKiller captures the process group of p (an already-started process). +// Because the process is launched with Setpgid/Setsid, p is a group leader and +// its PGID equals its PID; children spawned afterwards stay in the same group +// unless they explicitly create their own. +func NewKiller(p *os.Process) (*Killer, error) { + return &Killer{pgid: p.Pid}, nil +} + +// Kill sends SIGKILL to the entire process group (the process and every +// descendant that stayed in the group). A missing group (ESRCH) means the +// processes already exited and is not treated as an error. +func (k *Killer) Kill() error { + if k == nil || k.pgid <= 0 { + return nil + } + if err := syscall.Kill(-k.pgid, syscall.SIGKILL); err != nil && !errors.Is(err, syscall.ESRCH) { + return err + } + return nil +} + +// Close is a no-op on Unix; there is no job handle to release. +func (k *Killer) Close() error { return nil } diff --git a/act/container/process_unix_test.go b/internal/pkg/process/killer_unix_test.go similarity index 85% rename from act/container/process_unix_test.go rename to internal/pkg/process/killer_unix_test.go index 02315592..0caa8ebc 100644 --- a/act/container/process_unix_test.go +++ b/internal/pkg/process/killer_unix_test.go @@ -3,7 +3,7 @@ //go:build !windows && !plan9 -package container +package process import ( "fmt" @@ -47,11 +47,12 @@ func processAlive(pid int) bool { return true } -// TestProcessKillerKillsTree verifies that a process group captured by the -// killer is terminated together with a child the step spawns afterwards. This -// mirrors a step that launches a child which spawns further processes, where -// cancelling the job must take down the whole tree, not just the direct child. -func TestProcessKillerKillsTree(t *testing.T) { +// TestKillerKillsTree verifies that a process group captured by the killer is +// terminated together with a child the process spawns afterwards. This mirrors +// a step or post-task script that launches a child which spawns further +// processes, where cancelling must take down the whole tree, not just the +// direct child. +func TestKillerKillsTree(t *testing.T) { dir := t.TempDir() pidFile := filepath.Join(dir, "child.pid") @@ -60,8 +61,8 @@ func TestProcessKillerKillsTree(t *testing.T) { // child stays in the parent's process group, so the group kill must reach it. script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile) cmd := exec.Command("/bin/sh", "-c", script) - // Launch as its own process-group leader, exactly like a real step does (see - // getSysProcAttr), so the killer's PGID == the process PID. + // Launch as its own process-group leader, exactly like a real process does + // (see SysProcAttr), so the killer's PGID == the process PID. cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} require.NoError(t, cmd.Start()) t.Cleanup(func() { @@ -69,7 +70,7 @@ func TestProcessKillerKillsTree(t *testing.T) { _ = cmd.Wait() }) - killer, err := newProcessKiller(cmd.Process) + killer, err := NewKiller(cmd.Process) require.NoError(t, err) defer killer.Close() diff --git a/internal/pkg/process/killer_windows.go b/internal/pkg/process/killer_windows.go new file mode 100644 index 00000000..81a15303 --- /dev/null +++ b/internal/pkg/process/killer_windows.go @@ -0,0 +1,72 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package process + +import ( + "os" + + "golang.org/x/sys/windows" +) + +// Killer terminates a started process together with its entire descendant tree +// via a Windows Job Object. +// +// Background: a process (a step or a post-task script) often launches a process +// tree (a shell that starts a child which in turn spawns further GUI or +// background processes). The default exec.CommandContext cancellation only kills +// the direct child, so cancelling left the rest of the tree running. Because +// those orphans inherited the parent's stdout/stderr pipe, cmd.Wait() also +// blocked forever and the runner hung. +// +// Assigning the process to a Job Object lets us kill the whole tree atomically +// on cancellation (TerminateJobObject), which also closes the inherited pipe +// handles so cmd.Wait() can return. +type Killer struct { + job windows.Handle +} + +// NewKiller creates a Job Object and assigns p (an already-started process) to +// it. Children spawned by p afterwards are automatically part of the job. The +// job does NOT use JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, so closing the handle on +// normal completion does not kill legitimate background processes; the tree is +// only torn down by an explicit Kill (cancellation). +func NewKiller(p *os.Process) (*Killer, error) { + job, err := windows.CreateJobObject(nil, nil) + if err != nil { + return nil, err + } + + h, err := windows.OpenProcess(windows.PROCESS_SET_QUOTA|windows.PROCESS_TERMINATE, false, uint32(p.Pid)) + if err != nil { + windows.CloseHandle(job) + return nil, err + } + defer windows.CloseHandle(h) + + if err := windows.AssignProcessToJobObject(job, h); err != nil { + windows.CloseHandle(job) + return nil, err + } + + return &Killer{job: job}, nil +} + +// Kill terminates every process currently assigned to the job (the process and +// all of its descendants). +func (k *Killer) Kill() error { + if k == nil || k.job == 0 { + return nil + } + return windows.TerminateJobObject(k.job, 1) +} + +// Close releases the job handle. It does not terminate the processes. +func (k *Killer) Close() error { + if k == nil || k.job == 0 { + return nil + } + h := k.job + k.job = 0 + return windows.CloseHandle(h) +} diff --git a/act/container/process_windows_test.go b/internal/pkg/process/killer_windows_test.go similarity index 83% rename from act/container/process_windows_test.go rename to internal/pkg/process/killer_windows_test.go index c06cfbf8..647e5032 100644 --- a/act/container/process_windows_test.go +++ b/internal/pkg/process/killer_windows_test.go @@ -1,7 +1,7 @@ // Copyright 2026 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT -package container +package process import ( "fmt" @@ -32,11 +32,11 @@ func processAlive(pid int) bool { return code == stillActive } -// TestProcessKillerKillsTree verifies that a process assigned to the Job Object -// is terminated together with a child it spawns afterwards. This mirrors a step -// that launches a child which spawns further processes, where cancelling the -// job must take down the whole tree, not just the direct child. -func TestProcessKillerKillsTree(t *testing.T) { +// TestKillerKillsTree verifies that a process assigned to the Job Object is +// terminated together with a child it spawns afterwards. This mirrors a step or +// post-task script that launches a child which spawns further processes, where +// cancelling must take down the whole tree, not just the direct child. +func TestKillerKillsTree(t *testing.T) { dir := t.TempDir() pidFile := filepath.Join(dir, "child.pid") @@ -50,7 +50,7 @@ func TestProcessKillerKillsTree(t *testing.T) { require.NoError(t, cmd.Start()) t.Cleanup(func() { _ = cmd.Process.Kill() }) - killer, err := newProcessKiller(cmd.Process) + killer, err := NewKiller(cmd.Process) require.NoError(t, err) defer killer.Close() diff --git a/internal/pkg/process/sysprocattr_plan9.go b/internal/pkg/process/sysprocattr_plan9.go new file mode 100644 index 00000000..1038bfb7 --- /dev/null +++ b/internal/pkg/process/sysprocattr_plan9.go @@ -0,0 +1,17 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build plan9 + +package process + +import "syscall" + +// SysProcAttr returns the platform attributes used to start a process. Plan 9 +// has no process-group tree-kill (see Killer), so we only request a new rfork +// note group here. +func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Rfork: syscall.RFNOTEG, + } +} diff --git a/internal/pkg/process/sysprocattr_unix.go b/internal/pkg/process/sysprocattr_unix.go new file mode 100644 index 00000000..ddf24da2 --- /dev/null +++ b/internal/pkg/process/sysprocattr_unix.go @@ -0,0 +1,24 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows && !plan9 + +package process + +import "syscall" + +// SysProcAttr returns the platform attributes used to start a process so that a +// Killer can later tear down its whole process tree. On Unix the process becomes +// the leader of a new process group (or session, for the PTY path), so a +// signal to the negative PID reaches every descendant that stayed in the group. +func SysProcAttr(_ string, tty bool) *syscall.SysProcAttr { + if tty { + return &syscall.SysProcAttr{ + Setsid: true, + Setctty: true, + } + } + return &syscall.SysProcAttr{ + Setpgid: true, + } +} diff --git a/internal/pkg/process/sysprocattr_windows.go b/internal/pkg/process/sysprocattr_windows.go new file mode 100644 index 00000000..daf329d2 --- /dev/null +++ b/internal/pkg/process/sysprocattr_windows.go @@ -0,0 +1,14 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package process + +import "syscall" + +// SysProcAttr returns the platform attributes used to start a process so that a +// Killer can later tear down its whole process tree. On Windows the process is +// placed in a new process group; the descendant tree is reclaimed via the Job +// Object set up by NewKiller. +func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr { + return &syscall.SysProcAttr{CmdLine: cmdLine, CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP} +} diff --git a/internal/pkg/process/treekill.go b/internal/pkg/process/treekill.go new file mode 100644 index 00000000..bd478c14 --- /dev/null +++ b/internal/pkg/process/treekill.go @@ -0,0 +1,66 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package process + +import ( + "os" + "os/exec" + "sync/atomic" + "time" +) + +// treeKillWaitDelay bounds how long Wait lingers for the command's I/O pipes to +// drain after the process exits before force-closing them and returning. It also +// covers a command that backgrounds a process holding a pipe open after a clean +// exit. +const treeKillWaitDelay = 10 * time.Second + +// TreeKill wires an exec.Cmd so that cancelling it tears down the command's +// whole process tree (see Killer) rather than only the direct child, and bounds +// the post-exit I/O wait so a leftover pipe writer can never hang cmd.Wait. +// +// Background: a command often launches a process tree (a shell that starts a +// child which spawns further background processes). The default +// exec.CommandContext cancellation only kills the direct child, leaving the rest +// of the tree running; and because the orphans inherit cmd's stdout/stderr pipe, +// cmd.Wait() would block forever, hanging the caller. +// +// Callers still set cmd.SysProcAttr (via SysProcAttr) themselves, because the +// value differs between the plain and PTY execution paths. +type TreeKill struct { + killer atomic.Pointer[Killer] +} + +// NewTreeKill sets cmd.Cancel and cmd.WaitDelay. Call it before cmd.Start, then +// call Capture once after a successful Start. +func NewTreeKill(cmd *exec.Cmd) *TreeKill { + t := &TreeKill{} + cmd.Cancel = func() error { + if k := t.killer.Load(); k != nil { + return k.Kill() + } + if cmd.Process != nil { + return cmd.Process.Kill() + } + return nil + } + cmd.WaitDelay = treeKillWaitDelay + return t +} + +// Capture assigns the started process (and the descendants it spawns) to a +// Killer so cancellation can reach the whole tree — a Job Object on Windows +// (children spawned afterwards are auto-included) and the process group on Unix. +// Call it once after cmd.Start. On failure the command falls back to the default +// single-process kill and the returned error is for logging only; WaitDelay +// still bounds the wait. The returned Killer should be closed when the command +// finishes (Close is nil-safe). +func (t *TreeKill) Capture(p *os.Process) (*Killer, error) { + k, err := NewKiller(p) + if err != nil { + return nil, err + } + t.killer.Store(k) + return k, nil +} diff --git a/internal/pkg/report/reporter.go b/internal/pkg/report/reporter.go index 6cc0c353..f25790bd 100644 --- a/internal/pkg/report/reporter.go +++ b/internal/pkg/report/reporter.go @@ -44,11 +44,13 @@ type Reporter struct { // so the gauge skips no-op Set calls when the buffer size is unchanged. lastLogBufferRows int - state *runnerv1.TaskState - stateChanged bool - stateMu sync.RWMutex - outputs sync.Map - daemon chan struct{} + state *runnerv1.TaskState + stateChanged bool + stateMu sync.RWMutex + outputs sync.Map + daemon chan struct{} + heartbeatStop chan struct{} + heartbeatStopOnce sync.Once // Unix-nanos of the last successful UpdateTask. Atomic so the heartbeat // guard in ReportState reads it without contending stateMu. @@ -99,7 +101,8 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C state: &runnerv1.TaskState{ Id: task.Id, }, - daemon: make(chan struct{}), + daemon: make(chan struct{}), + heartbeatStop: make(chan struct{}), } if task.Secrets["ACTIONS_STEP_DEBUG"] == "true" { @@ -273,6 +276,15 @@ func (r *Reporter) RunDaemon() { go r.runDaemonLoop() } +// StopHeartbeats stops periodic UpdateTask heartbeats without cancelling the +// task context. Close() still delivers the final flush. Safe to call multiple +// times and when the context is already cancelled. +func (r *Reporter) StopHeartbeats() { + r.heartbeatStopOnce.Do(func() { + close(r.heartbeatStop) + }) +} + func (r *Reporter) stopLatencyTimer(active *bool, timer *time.Timer) { if *active { if !timer.Stop() { @@ -339,6 +351,12 @@ func (r *Reporter) runDaemonLoop() { // delivers the final flush on a detached context (flushFinal). close(r.daemon) return + + case <-r.heartbeatStop: + // Stop heartbeating during post-task script execution. Close() still + // delivers the final flush on a detached context (flushFinal). + close(r.daemon) + return } r.stateMu.RLock() diff --git a/internal/pkg/report/reporter_test.go b/internal/pkg/report/reporter_test.go index 606e1eaa..c0da9dfe 100644 --- a/internal/pkg/report/reporter_test.go +++ b/internal/pkg/report/reporter_test.go @@ -921,3 +921,65 @@ func TestReporter_CloseReportsCancelledOnCanceledCtx(t *testing.T) { assert.True(t, foundCancelled, "final log must contain a 'Cancelled' row") assert.False(t, foundEarlyTermination, "final log must not contain 'Early termination' on the cancel path") } + +// TestReporter_StopHeartbeats verifies that StopHeartbeats ends periodic +// UpdateTask heartbeats while Close() still flushes the final state. +func TestReporter_StopHeartbeats(t *testing.T) { + var updateTaskCalls atomic.Int64 + + client := mocks.NewClient(t) + client.On("UpdateLog", mock.Anything, mock.Anything).Maybe().Return( + func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) { + return connect_go.NewResponse(&runnerv1.UpdateLogResponse{ + AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)), + }), nil + }, + ) + client.On("UpdateTask", mock.Anything, mock.Anything).Return( + func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) { + updateTaskCalls.Add(1) + return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil + }, + ) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + taskCtx, err := structpb.NewStruct(map[string]any{}) + require.NoError(t, err) + + cfg, err := config.LoadDefault("") + require.NoError(t, err) + cfg.Runner.StateReportInterval = 20 * time.Millisecond + cfg.Runner.LogReportInterval = time.Hour + + reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg) + reporter.ResetSteps(1) + reporter.RunDaemon() + + reporter.stateMu.Lock() + reporter.stateChanged = true + reporter.state.Result = runnerv1.Result_RESULT_SUCCESS + reporter.state.StoppedAt = timestamppb.Now() + reporter.stateMu.Unlock() + + require.Eventually(t, func() bool { + return updateTaskCalls.Load() >= 1 + }, time.Second, 5*time.Millisecond, "daemon must send at least one UpdateTask before StopHeartbeats") + + beforeStop := updateTaskCalls.Load() + reporter.StopHeartbeats() + + select { + case <-reporter.daemon: + case <-time.After(time.Second): + t.Fatal("StopHeartbeats must stop the daemon loop") + } + + time.Sleep(3 * cfg.Runner.StateReportInterval) + assert.Equal(t, beforeStop, updateTaskCalls.Load(), + "UpdateTask must not be called after StopHeartbeats") + + require.NoError(t, reporter.Close("")) + assert.Greater(t, updateTaskCalls.Load(), beforeStop, + "Close() must still send a final UpdateTask after StopHeartbeats") +}