feat: Add optional runner.post_task_script hook after task cleanup (#1026)

- Adds `runner.post_task_script` and `runner.post_task_script_timeout` (default `5m`) to run a host executable after each task’s built-in cleanup (post-steps, container teardown, bind-workdir removal).
- Stops task heartbeats via `Reporter.StopHeartbeats()` while the script runs so Gitea won’t assign overlapping work; the final task acknowledgement still happens in `reporter.Close()`.
- Script output goes to the runner process log; non-zero exits are warned only and do not change the job result.
- Documents lifecycle, offline behavior, timeouts, and Windows limits (`.ps1` not supported yet) in `docs/post-task-script.md`.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1026
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
Nicolas
2026-06-19 19:28:10 +00:00
parent df0370f8bf
commit 007717956a
28 changed files with 922 additions and 263 deletions

View File

@@ -0,0 +1,29 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build plan9
package process
import "os"
// Killer falls back to single-process termination on platforms without a
// process-group / Job Object tree-kill. The Job Object (Windows) and process
// group (Unix) based tree-kills live in killer_windows.go / killer_unix.go;
// here we just kill the direct child, matching the previous default behaviour.
type Killer struct {
// p is the process handed to NewKiller; it is the only process Kill targets.
p *os.Process
}
// NewKiller wraps p in a Killer. It makes no system calls and always returns a
// nil error; the error result is kept so the signature lines up with the other
// platform implementations of NewKiller.
func NewKiller(p *os.Process) (*Killer, error) {
	k := &Killer{p: p}
	return k, nil
}
// Kill terminates the wrapped process only; descendants are left untouched. A
// nil receiver, or a Killer holding no process, is a no-op that returns nil.
func (k *Killer) Kill() error {
	if k != nil && k.p != nil {
		return k.p.Kill()
	}
	return nil
}
// Close is a no-op on this platform: the Killer owns no handle that would need
// releasing. It always returns nil.
func (k *Killer) Close() error {
	return nil
}

View File

@@ -0,0 +1,56 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import (
"errors"
"os"
"syscall"
)
// Killer terminates a started process together with its whole process group,
// which is the Unix counterpart of the Windows Job Object tree-kill.
//
// Background: a process (a step or a post-task script) often launches a process
// tree (a shell that starts a child which in turn spawns further background
// processes). The default exec.CommandContext cancellation only kills the
// direct child, so cancelling left the rest of the tree running. Because those
// orphans inherited the parent's stdout/stderr pipe, cmd.Wait() also blocked
// forever and the runner hung.
//
// Processes are started with Setpgid (or Setsid for the PTY path, see
// SysProcAttr), which makes the process the leader of a new process group whose
// ID equals its PID. Signalling the negative PID delivers to every process
// still in that group, so we can tear down the whole tree atomically on
// cancellation, which also closes the inherited pipe handles so cmd.Wait() can
// return.
type Killer struct {
// pgid is the process-group ID that Kill signals (as a negative PID). Kill
// treats values <= 0 as "nothing to kill".
pgid int
}
// NewKiller captures the process group of p (an already-started process).
//
// The process is expected to have been launched with the attributes from
// SysProcAttr (Setpgid, or Setsid on the PTY path), which makes p a group
// leader whose PGID equals its PID; children spawned afterwards stay in the
// same group unless they explicitly create their own. No system call is made
// here: the PID is simply recorded as the group ID.
//
// A nil p yields os.ErrInvalid instead of a nil-pointer panic, so callers such
// as TreeKill.Capture can log the failure and keep their single-process
// fallback.
func NewKiller(p *os.Process) (*Killer, error) {
	if p == nil {
		return nil, os.ErrInvalid
	}
	return &Killer{pgid: p.Pid}, nil
}
// Kill delivers SIGKILL to the captured process group, i.e. to the process and
// every descendant that is still a member of the group. ESRCH (no such group)
// means everything already exited and is reported as success; any other error
// from kill(2) is returned unchanged. A nil receiver or a non-positive group ID
// is a no-op.
func (k *Killer) Kill() error {
	if k == nil || k.pgid <= 0 {
		return nil
	}
	// A negative PID addresses the whole process group rather than one process.
	err := syscall.Kill(-k.pgid, syscall.SIGKILL)
	switch {
	case err == nil, errors.Is(err, syscall.ESRCH):
		return nil
	default:
		return err
	}
}
// Close does nothing on Unix because a process group has no handle to release.
// It exists for parity with the Windows Killer and always returns nil.
func (k *Killer) Close() error {
	return nil
}

View File

@@ -0,0 +1,101 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// processAlive reports whether pid refers to a still-running process. Signal 0
// performs error checking without delivering a signal: a nil error or EPERM
// means the process exists (EPERM only says we may not signal it), while any
// other error, notably ESRCH, means it is gone.
//
// On Linux, zombie processes (state Z in /proc/<pid>/stat) appear alive to
// kill(0) but have already terminated — their corpse lingers until the parent
// calls wait(). In a Docker container the child may be reparented to a PID 1
// that does not reap promptly, so we treat zombies as not alive.
func processAlive(pid int) bool {
	// syscall.Kill returns a bare syscall.Errno, so a direct comparison is
	// sufficient to recognise EPERM.
	if err := syscall.Kill(pid, 0); err != nil && err != syscall.EPERM {
		return false
	}

	// /proc is only available on Linux; when the stat file cannot be read we
	// have no zombie information and rely on the kill(0) verdict.
	stat, err := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
	if err != nil {
		return true
	}

	// Format: "pid (comm) state ..." — state follows the closing ')' of the
	// command name (which may itself contain spaces and parens), hence the
	// search for the LAST ") ".
	text := string(stat)
	idx := strings.LastIndex(text, ") ")
	if idx < 0 {
		return true
	}
	fields := strings.Fields(text[idx+2:])
	if len(fields) > 0 && fields[0] == "Z" {
		return false // zombie: terminated but not yet reaped
	}
	return true
}
// TestKillerKillsTree checks that Killer.Kill takes down the whole process
// group: the shell it was created for and a child that shell backgrounds
// afterwards. This is the situation of a step or post-task script whose
// children spawn further processes, where cancellation must reach the entire
// tree and not just the direct child.
func TestKillerKillsTree(t *testing.T) {
	pidFile := filepath.Join(t.TempDir(), "child.pid")

	// The shell backgrounds a long-lived sleep, records that sleep's PID in
	// pidFile and then sleeps itself. A non-interactive sh has job control off,
	// so the backgrounded sleep remains in the shell's process group and a
	// group kill has to reach it.
	script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile)
	parent := exec.Command("/bin/sh", "-c", script)
	// Make the shell a process-group leader, as SysProcAttr does for real
	// processes, so that the killer's PGID equals the shell's PID.
	parent.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	require.NoError(t, parent.Start())
	t.Cleanup(func() {
		// Safety net for early failures: kill the group and reap the shell.
		_ = syscall.Kill(-parent.Process.Pid, syscall.SIGKILL)
		_ = parent.Wait()
	})

	killer, err := NewKiller(parent.Process)
	require.NoError(t, err)
	defer killer.Close()

	// Poll until the shell has written the PID of a live backgrounded child.
	var childPID int
	childStarted := func() bool {
		raw, readErr := os.ReadFile(pidFile)
		if readErr != nil {
			return false
		}
		text := strings.TrimSpace(string(raw))
		if text == "" {
			return false
		}
		// A parse failure leaves childPID at 0, which simply triggers a retry.
		childPID, _ = strconv.Atoi(text)
		return childPID > 0 && processAlive(childPID)
	}
	require.Eventually(t, childStarted, 20*time.Second, 100*time.Millisecond, "child process should start")

	// The group kill has to terminate the shell as well as the child.
	require.NoError(t, killer.Kill())

	// Reap the shell so it cannot linger as a zombie (which would still look
	// alive); after SIGKILL, Wait returns promptly.
	_ = parent.Wait()

	childGone := func() bool {
		return !processAlive(childPID)
	}
	require.Eventually(t, childGone, 20*time.Second, 100*time.Millisecond, "backgrounded child should be terminated")
}

View File

@@ -0,0 +1,72 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"os"
"golang.org/x/sys/windows"
)
// Killer terminates a started process together with its entire descendant tree
// via a Windows Job Object.
//
// Background: a process (a step or a post-task script) often launches a process
// tree (a shell that starts a child which in turn spawns further GUI or
// background processes). The default exec.CommandContext cancellation only kills
// the direct child, so cancelling left the rest of the tree running. Because
// those orphans inherited the parent's stdout/stderr pipe, cmd.Wait() also
// blocked forever and the runner hung.
//
// Assigning the process to a Job Object lets us kill the whole tree atomically
// on cancellation (TerminateJobObject), which also closes the inherited pipe
// handles so cmd.Wait() can return.
type Killer struct {
// job is the Job Object handle created by NewKiller. Close resets it to 0,
// after which Kill and Close are no-ops.
job windows.Handle
}
// NewKiller creates a Job Object and assigns p (an already-started process) to
// it. Children spawned by p afterwards are automatically part of the job. The
// job does NOT use JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, so closing the handle on
// normal completion does not kill legitimate background processes; the tree is
// only torn down by an explicit Kill (cancellation).
//
// A nil p yields os.ErrInvalid instead of a nil-pointer panic, and is rejected
// before any handle is created. On every other failure the job handle is
// closed again and the underlying Windows error is returned.
func NewKiller(p *os.Process) (*Killer, error) {
	if p == nil {
		return nil, os.ErrInvalid
	}

	job, err := windows.CreateJobObject(nil, nil)
	if err != nil {
		return nil, err
	}

	// PROCESS_SET_QUOTA and PROCESS_TERMINATE are the access rights requested
	// for the handle passed to AssignProcessToJobObject.
	h, err := windows.OpenProcess(windows.PROCESS_SET_QUOTA|windows.PROCESS_TERMINATE, false, uint32(p.Pid))
	if err != nil {
		// Best-effort cleanup; the OpenProcess error is the one worth reporting.
		_ = windows.CloseHandle(job)
		return nil, err
	}
	// The process handle is only needed for the assignment itself.
	defer windows.CloseHandle(h)

	if err := windows.AssignProcessToJobObject(job, h); err != nil {
		_ = windows.CloseHandle(job)
		return nil, err
	}
	return &Killer{job: job}, nil
}
// Kill terminates every process currently assigned to the job — the original
// process and all of its descendants — with exit code 1. A nil receiver or an
// already closed Killer is a no-op that returns nil.
func (k *Killer) Kill() error {
	if k != nil && k.job != 0 {
		return windows.TerminateJobObject(k.job, 1)
	}
	return nil
}
// Close releases the job handle without terminating the processes in the job.
// The stored handle is cleared first, so a second Close (or a later Kill) is a
// harmless no-op. A nil receiver is accepted.
func (k *Killer) Close() error {
	if k == nil {
		return nil
	}
	job := k.job
	if job == 0 {
		return nil
	}
	k.job = 0
	return windows.CloseHandle(job)
}

View File

@@ -0,0 +1,78 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"golang.org/x/sys/windows"
)
// processAlive reports whether pid refers to a still-running process. The
// process counts as alive only if it can be opened for limited query access and
// its exit code is still STILL_ACTIVE; any API failure is read as "not alive".
func processAlive(pid int) bool {
	const stillActive = 259 // STILL_ACTIVE

	handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
	if err != nil {
		return false
	}
	defer windows.CloseHandle(handle)

	var exitCode uint32
	err = windows.GetExitCodeProcess(handle, &exitCode)
	return err == nil && exitCode == stillActive
}
// TestKillerKillsTree verifies that a process assigned to the Job Object is
// terminated together with a child it spawns afterwards. This mirrors a step or
// post-task script that launches a child which spawns further processes, where
// cancelling must take down the whole tree, not just the direct child.
func TestKillerKillsTree(t *testing.T) {
	dir := t.TempDir()
	pidFile := filepath.Join(dir, "child.pid")

	// Parent powershell spawns a detached, long-lived child powershell (writing
	// its PID to a file) and then sleeps. The child is launched AFTER the parent
	// has been assigned to the job, so it must be captured by the job too.
	script := fmt.Sprintf(
		`$c = Start-Process powershell -PassThru -ArgumentList '-NoProfile','-Command','Start-Sleep -Seconds 600'; `+
			`Set-Content -LiteralPath %q -Value $c.Id; Start-Sleep -Seconds 600`, pidFile)
	cmd := exec.Command("powershell.exe", "-NoProfile", "-Command", script)
	require.NoError(t, cmd.Start())
	t.Cleanup(func() {
		// Kill and reap the parent so its process handle is released even when
		// the test fails early.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
	})

	killer, err := NewKiller(cmd.Process)
	require.NoError(t, err)
	// Cleanups run last-in-first-out, so this one runs before the parent
	// cleanup above. Terminating the job here keeps the detached child from
	// outliving a test that fails before the explicit Kill below; errors are
	// ignored because the job is normally empty by then.
	t.Cleanup(func() {
		_ = killer.Kill()
		_ = killer.Close()
	})

	// Wait for the child PID to be reported.
	var childPID int
	require.Eventually(t, func() bool {
		b, e := os.ReadFile(pidFile)
		if e != nil {
			return false
		}
		s := strings.TrimSpace(string(b))
		if s == "" {
			return false
		}
		// A parse failure leaves childPID at 0, which simply triggers a retry.
		childPID, _ = strconv.Atoi(s)
		return childPID > 0 && processAlive(childPID)
	}, 20*time.Second, 200*time.Millisecond, "child process should start")

	// Killing the job must terminate both the parent and the detached child.
	require.NoError(t, killer.Kill())
	require.Eventually(t, func() bool {
		return !processAlive(cmd.Process.Pid) && !processAlive(childPID)
	}, 20*time.Second, 200*time.Millisecond, "parent and child should both be terminated")
}

View File

@@ -0,0 +1,17 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build plan9
package process
import "syscall"
// SysProcAttr returns the platform attributes used to start a process. Plan 9
// has no process-group tree-kill (see Killer), so we only request a new rfork
// note group here.
func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr {
return &syscall.SysProcAttr{
Rfork: syscall.RFNOTEG,
}
}

View File

@@ -0,0 +1,24 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import "syscall"
// SysProcAttr builds the attributes used to start a process in such a way that
// a Killer can later tear down its whole process tree. On Unix the process
// becomes the leader of a new session with a controlling terminal when tty is
// true (the PTY path), and the leader of a new process group otherwise. Either
// way a signal sent to the negative PID reaches every descendant that stayed in
// the group. The first argument (a command line) is unused on Unix.
func SysProcAttr(_ string, tty bool) *syscall.SysProcAttr {
	attr := &syscall.SysProcAttr{}
	if tty {
		attr.Setsid = true
		attr.Setctty = true
	} else {
		attr.Setpgid = true
	}
	return attr
}

View File

@@ -0,0 +1,14 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import "syscall"
// SysProcAttr returns the platform attributes used to start a process so that a
// Killer can later tear down its whole process tree. On Windows the process is
// placed in a new process group; the descendant tree is reclaimed via the Job
// Object set up by NewKiller.
func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr {
return &syscall.SysProcAttr{CmdLine: cmdLine, CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"os"
"os/exec"
"sync/atomic"
"time"
)
// treeKillWaitDelay is the value NewTreeKill assigns to exec.Cmd.WaitDelay. It
// bounds how long Wait lingers for the command's I/O pipes to drain after the
// process exits before force-closing them and returning. It also covers a
// command that backgrounds a process holding a pipe open after a clean exit.
const treeKillWaitDelay = 10 * time.Second
// TreeKill wires an exec.Cmd so that cancelling it tears down the command's
// whole process tree (see Killer) rather than only the direct child, and bounds
// the post-exit I/O wait so a leftover pipe writer can never hang cmd.Wait.
//
// Background: a command often launches a process tree (a shell that starts a
// child which spawns further background processes). The default
// exec.CommandContext cancellation only kills the direct child, leaving the rest
// of the tree running; and because the orphans inherit cmd's stdout/stderr pipe,
// cmd.Wait() would block forever, hanging the caller.
//
// Callers still set cmd.SysProcAttr (via SysProcAttr) themselves, because the
// value differs between the plain and PTY execution paths.
type TreeKill struct {
// killer stays nil until Capture stores a Killer. The cmd.Cancel hook
// installed by NewTreeKill loads it on cancellation, which may happen on
// another goroutine, hence the atomic pointer.
killer atomic.Pointer[Killer]
}
// NewTreeKill installs the tree-kill hooks on cmd and returns the TreeKill that
// backs them. It overwrites cmd.Cancel and cmd.WaitDelay, so call it before
// cmd.Start and call Capture once after a successful Start. Per the os/exec
// documentation, a non-nil Cancel is only accepted on commands created with
// exec.CommandContext.
//
// On cancellation the hook prefers the captured Killer (whole tree). If none
// has been captured yet it falls back to killing the direct child, and if the
// process was never started it does nothing.
func NewTreeKill(cmd *exec.Cmd) *TreeKill {
	tk := new(TreeKill)
	cmd.WaitDelay = treeKillWaitDelay
	cmd.Cancel = func() error {
		killer := tk.killer.Load()
		switch {
		case killer != nil:
			return killer.Kill()
		case cmd.Process != nil:
			return cmd.Process.Kill()
		default:
			return nil
		}
	}
	return tk
}
// Capture hands the started process (and the descendants it spawns) to a Killer
// so that cancellation reaches the whole tree — a Job Object on Windows, where
// children spawned afterwards are included automatically, and the process group
// on Unix. Call it once after cmd.Start.
//
// On success the Killer is published to the cancel hook and returned; the
// caller should Close it when the command finishes (Close is nil-safe). On
// failure nothing is stored, so the command keeps the default single-process
// kill, and the returned error is for logging only; WaitDelay still bounds the
// wait.
func (t *TreeKill) Capture(p *os.Process) (*Killer, error) {
	killer, err := NewKiller(p)
	if err == nil {
		t.killer.Store(killer)
		return killer, nil
	}
	return nil, err
}