mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-06-15 14:24:22 +02:00
fix(cleanup): kill Unix step process group on cancel to avoid hang (#1025)
Cancelling a job on a Linux/macOS host runner can leave the spawned process tree running and hang the runner — the same failure mode fixed for Windows in #1011, just on the other platforms.

Steps are launched as process-group leaders (`Setpgid`, or `Setsid` for the PTY path), but the default `exec.CommandContext` cancellation only kills the **direct child**. When a step launches a shell that starts a child which in turn spawns further background processes, cancelling the job leaves the descendants running. Because those orphans inherited the step's stdout/stderr pipe, the read end never hits EOF and `cmd.Wait()` blocks forever.

Because the step executor never returns:

- the orphaned processes keep running (the cancelled work is not actually stopped), and
- end-of-job cleanup is never reached, so the runner appears to go offline / stop picking up jobs.

## Fix

Apply the same tree-kill approach as Windows, using the Unix counterpart of a Job Object: the **process group**.

- Add a Unix `processKiller` (`process_unix.go`) that captures the step's PGID (== PID, since the step is launched as a group leader) and sends `SIGKILL` to the whole group on cancellation. This also closes the inherited pipe handles so `cmd.Wait()` can return. `ESRCH` (group already gone) is not treated as an error.
- Restrict the previous no-op stub (`process_other.go`) to `plan9` and have it fall back to a single-process kill, preserving plan9's prior behaviour.
- Wire `cmd.Cancel` (tree kill) and `cmd.WaitDelay` (10s) **unconditionally** in `exec()` instead of Windows-only. `WaitDelay` also covers a step that backgrounds a process holding the pipe open after the main process exits.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1025
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
@@ -323,28 +323,28 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
||||
cmd.Dir = wd
|
||||
cmd.SysProcAttr = getSysProcAttr(cmdline, false)
|
||||
|
||||
// On Windows a step often launches a process tree (a shell that starts a
|
||||
// child which spawns further GUI or background processes). The default
|
||||
// context cancellation only kills the direct child, leaving the rest of the
|
||||
// tree running; and because the orphans inherit cmd's stdout/stderr pipe,
|
||||
// cmd.Wait() would block forever, hanging the runner. Kill the whole tree
|
||||
// via a Job Object on cancellation, and bound the wait so a leftover pipe
|
||||
// writer can never hang Wait indefinitely.
|
||||
// A step often launches a process tree (a shell that starts a child which
|
||||
// spawns further background or GUI processes). The default context
|
||||
// cancellation only kills the direct child, leaving the rest of the tree
|
||||
// running; and because the orphans inherit cmd's stdout/stderr pipe,
|
||||
// cmd.Wait() would block forever, hanging the runner. Kill the whole tree on
|
||||
// cancellation — via a Job Object on Windows and the process group on Unix
|
||||
// (see processKiller) — and bound the wait so a leftover pipe writer can
|
||||
// never hang Wait indefinitely.
|
||||
var killer atomic.Pointer[processKiller]
|
||||
if runtime.GOOS == "windows" {
|
||||
cmd.Cancel = func() error {
|
||||
if k := killer.Load(); k != nil {
|
||||
return k.Kill()
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
return cmd.Process.Kill()
|
||||
}
|
||||
return nil
|
||||
cmd.Cancel = func() error {
|
||||
if k := killer.Load(); k != nil {
|
||||
return k.Kill()
|
||||
}
|
||||
// Once the step process has exited, give its I/O pipes at most this long
|
||||
// to drain before Wait force-closes them and returns (Go's WaitDelay).
|
||||
cmd.WaitDelay = 10 * time.Second
|
||||
if cmd.Process != nil {
|
||||
return cmd.Process.Kill()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// Once the step process has exited, give its I/O pipes at most this long to
|
||||
// drain before Wait force-closes them and returns (Go's WaitDelay). This
|
||||
// also covers a step that backgrounds a process holding the pipe open.
|
||||
cmd.WaitDelay = 10 * time.Second
|
||||
|
||||
var ppty *os.File
|
||||
var tty *os.File
|
||||
@@ -375,17 +375,16 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
if runtime.GOOS == "windows" {
|
||||
// Assign the started process to a Job Object so cmd.Cancel can kill the
|
||||
// whole descendant tree. Children spawned afterwards are auto-included.
|
||||
// On failure (e.g. nested-job restrictions) we fall back to the default
|
||||
// single-process kill; WaitDelay + end-of-job cleanup still apply.
|
||||
if k, kerr := newProcessKiller(cmd.Process); kerr != nil {
|
||||
common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr)
|
||||
} else {
|
||||
killer.Store(k)
|
||||
defer k.Close()
|
||||
}
|
||||
// Capture the started process for tree-kill on cancellation: a Job Object on
|
||||
// Windows (children spawned afterwards are auto-included) and the process
|
||||
// group on Unix. On failure (e.g. Windows nested-job restrictions) we fall
|
||||
// back to the default single-process kill; WaitDelay + end-of-job cleanup
|
||||
// still apply.
|
||||
if k, kerr := newProcessKiller(cmd.Process); kerr != nil {
|
||||
common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr)
|
||||
} else {
|
||||
killer.Store(k)
|
||||
defer k.Close()
|
||||
}
|
||||
err = cmd.Wait()
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user