mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-06-15 14:24:22 +02:00
fix(cleanup): kill Unix step process group on cancel to avoid hang (#1025)
Cancelling a job on a Linux/macOS host runner can leave the spawned process tree running and hang the runner — the same failure mode fixed for Windows in #1011, just on the other platforms.

Steps are launched as process-group leaders (`Setpgid`, or `Setsid` for the PTY path), but the default `exec.CommandContext` cancellation only kills the **direct child**. When a step launches a shell that starts a child which in turn spawns further background processes, cancelling the job leaves the descendants running. Because those orphans inherited the step's stdout/stderr pipe, the read end never hits EOF and `cmd.Wait()` blocks forever.

Because the step executor never returns:

- the orphaned processes keep running (the cancelled work is not actually stopped), and
- end-of-job cleanup is never reached, so the runner appears to go offline / stop picking up jobs.

## Fix

Apply the same tree-kill approach as Windows, using the Unix counterpart of a Job Object: the **process group**.

- Add a Unix `processKiller` (`process_unix.go`) that captures the step's PGID (== PID, since the step is launched as a group leader) and sends `SIGKILL` to the whole group on cancellation. This also closes the inherited pipe handles so `cmd.Wait()` can return. `ESRCH` (group already gone) is not treated as an error.
- Restrict the previous no-op stub (`process_other.go`) to `plan9` and have it fall back to a single-process kill, preserving plan9's prior behaviour.
- Wire `cmd.Cancel` (tree kill) and `cmd.WaitDelay` (10s) **unconditionally** in `exec()` instead of Windows-only. `WaitDelay` also covers a step that backgrounds a process holding the pipe open after the main process exits.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1025
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
@@ -323,15 +323,15 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
|||||||
cmd.Dir = wd
|
cmd.Dir = wd
|
||||||
cmd.SysProcAttr = getSysProcAttr(cmdline, false)
|
cmd.SysProcAttr = getSysProcAttr(cmdline, false)
|
||||||
|
|
||||||
// On Windows a step often launches a process tree (a shell that starts a
|
// A step often launches a process tree (a shell that starts a child which
|
||||||
// child which spawns further GUI or background processes). The default
|
// spawns further background or GUI processes). The default context
|
||||||
// context cancellation only kills the direct child, leaving the rest of the
|
// cancellation only kills the direct child, leaving the rest of the tree
|
||||||
// tree running; and because the orphans inherit cmd's stdout/stderr pipe,
|
// running; and because the orphans inherit cmd's stdout/stderr pipe,
|
||||||
// cmd.Wait() would block forever, hanging the runner. Kill the whole tree
|
// cmd.Wait() would block forever, hanging the runner. Kill the whole tree on
|
||||||
// via a Job Object on cancellation, and bound the wait so a leftover pipe
|
// cancellation — via a Job Object on Windows and the process group on Unix
|
||||||
// writer can never hang Wait indefinitely.
|
// (see processKiller) — and bound the wait so a leftover pipe writer can
|
||||||
|
// never hang Wait indefinitely.
|
||||||
var killer atomic.Pointer[processKiller]
|
var killer atomic.Pointer[processKiller]
|
||||||
if runtime.GOOS == "windows" {
|
|
||||||
cmd.Cancel = func() error {
|
cmd.Cancel = func() error {
|
||||||
if k := killer.Load(); k != nil {
|
if k := killer.Load(); k != nil {
|
||||||
return k.Kill()
|
return k.Kill()
|
||||||
@@ -341,10 +341,10 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Once the step process has exited, give its I/O pipes at most this long
|
// Once the step process has exited, give its I/O pipes at most this long to
|
||||||
// to drain before Wait force-closes them and returns (Go's WaitDelay).
|
// drain before Wait force-closes them and returns (Go's WaitDelay). This
|
||||||
|
// also covers a step that backgrounds a process holding the pipe open.
|
||||||
cmd.WaitDelay = 10 * time.Second
|
cmd.WaitDelay = 10 * time.Second
|
||||||
}
|
|
||||||
|
|
||||||
var ppty *os.File
|
var ppty *os.File
|
||||||
var tty *os.File
|
var tty *os.File
|
||||||
@@ -375,18 +375,17 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
|||||||
if err := cmd.Start(); err != nil {
|
if err := cmd.Start(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if runtime.GOOS == "windows" {
|
// Capture the started process for tree-kill on cancellation: a Job Object on
|
||||||
// Assign the started process to a Job Object so cmd.Cancel can kill the
|
// Windows (children spawned afterwards are auto-included) and the process
|
||||||
// whole descendant tree. Children spawned afterwards are auto-included.
|
// group on Unix. On failure (e.g. Windows nested-job restrictions) we fall
|
||||||
// On failure (e.g. nested-job restrictions) we fall back to the default
|
// back to the default single-process kill; WaitDelay + end-of-job cleanup
|
||||||
// single-process kill; WaitDelay + end-of-job cleanup still apply.
|
// still apply.
|
||||||
if k, kerr := newProcessKiller(cmd.Process); kerr != nil {
|
if k, kerr := newProcessKiller(cmd.Process); kerr != nil {
|
||||||
common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr)
|
common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr)
|
||||||
} else {
|
} else {
|
||||||
killer.Store(k)
|
killer.Store(k)
|
||||||
defer k.Close()
|
defer k.Close()
|
||||||
}
|
}
|
||||||
}
|
|
||||||
err = cmd.Wait()
|
err = cmd.Wait()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var exitErr *exec.ExitError
|
var exitErr *exec.ExitError
|
||||||
|
|||||||
@@ -1,19 +1,29 @@
|
|||||||
// Copyright 2026 The Gitea Authors. All rights reserved.
|
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||||
// SPDX-License-Identifier: MIT
|
// SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
//go:build !windows
|
//go:build plan9
|
||||||
|
|
||||||
package container
|
package container
|
||||||
|
|
||||||
import "os"
|
import "os"
|
||||||
|
|
||||||
// processKiller is a no-op on non-Windows platforms. The Job Object based
|
// processKiller falls back to single-process termination on platforms without
|
||||||
// tree-kill is only wired in on Windows (see exec()); elsewhere the default
|
// a process-group / Job Object tree-kill. The Job Object (Windows) and process
|
||||||
// exec.CommandContext cancellation and Setpgid handling apply.
|
// group (Unix) based tree-kills live in process_windows.go / process_unix.go;
|
||||||
type processKiller struct{}
|
// here we just kill the direct child, matching the previous default behaviour.
|
||||||
|
type processKiller struct {
|
||||||
|
p *os.Process
|
||||||
|
}
|
||||||
|
|
||||||
func newProcessKiller(_ *os.Process) (*processKiller, error) { return &processKiller{}, nil }
|
func newProcessKiller(p *os.Process) (*processKiller, error) {
|
||||||
|
return &processKiller{p: p}, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (k *processKiller) Kill() error { return nil }
|
func (k *processKiller) Kill() error {
|
||||||
|
if k == nil || k.p == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return k.p.Kill()
|
||||||
|
}
|
||||||
|
|
||||||
func (k *processKiller) Close() error { return nil }
|
func (k *processKiller) Close() error { return nil }
|
||||||
|
|||||||
56
act/container/process_unix.go
Normal file
56
act/container/process_unix.go
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
//go:build !windows && !plan9
|
||||||
|
|
||||||
|
package container
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// processKiller terminates a step process together with its whole process
// group, which is the Unix counterpart of the Windows Job Object tree-kill.
//
// Background: a step often launches a process tree (a shell that starts a child
// which in turn spawns further background processes). The default
// exec.CommandContext cancellation only kills the direct child, so cancelling a
// job left the rest of the tree running. Because those orphans inherited the
// step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner
// hung.
//
// Steps are started with Setpgid (or Setsid for the PTY path, see
// getSysProcAttr), which makes the step process the leader of a new process
// group whose ID equals its PID. Signalling the negative PID delivers to every
// process still in that group, so the whole tree is torn down on cancellation,
// which also closes the inherited pipe handles so cmd.Wait() can return.
type processKiller struct {
	// pgid is the process group to signal. Values <= 1 make Kill a no-op.
	pgid int
}

// newProcessKiller captures the process group of p (an already-started
// process). Because the step is launched with Setpgid/Setsid, p is a group
// leader and its PGID equals its PID; children spawned afterwards stay in the
// same group unless they explicitly create their own.
//
// A nil p yields an error instead of a nil-pointer panic, so the caller can
// log it and fall back to its default single-process kill.
func newProcessKiller(p *os.Process) (*processKiller, error) {
	if p == nil {
		return nil, errors.New("process killer: nil process")
	}
	return &processKiller{pgid: p.Pid}, nil
}

// Kill sends SIGKILL to the entire process group (the step process and every
// descendant that stayed in the group). A missing group (ESRCH) means the
// processes already exited and is not treated as an error.
//
// Kill is a no-op for a nil receiver and for any pgid <= 1. The pgid == 1 case
// matters: kill(2) interprets a target of -1 as "every process the caller is
// allowed to signal", so negating it must never reach the syscall.
func (k *processKiller) Kill() error {
	if k == nil || k.pgid <= 1 {
		return nil
	}
	if err := syscall.Kill(-k.pgid, syscall.SIGKILL); err != nil && !errors.Is(err, syscall.ESRCH) {
		return err
	}
	return nil
}

// Close is a no-op on Unix; there is no job handle to release.
func (k *processKiller) Close() error { return nil }
|
||||||
100
act/container/process_unix_test.go
Normal file
100
act/container/process_unix_test.go
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
// Copyright 2026 The Gitea Authors. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
//go:build !windows && !plan9
|
||||||
|
|
||||||
|
package container
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"syscall"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
// processAlive reports whether pid refers to a still-running process. Signal 0
// performs error checking without delivering a signal: a nil error (or EPERM)
// means the process exists, any other error (notably ESRCH) means it is gone.
//
// On Linux, zombie processes (state Z in /proc/<pid>/stat) appear alive to
// kill(0) but have already terminated — their corpse lingers until the parent
// calls wait(). In a Docker container the child may be reparented to a PID 1
// that does not reap promptly, so we treat zombies as not alive.
func processAlive(pid int) bool {
	// EPERM still proves the process exists; we merely lack permission to
	// signal it. syscall.Kill returns a bare syscall.Errno, so a direct
	// comparison is sufficient here.
	if err := syscall.Kill(pid, 0); err != nil && err != syscall.EPERM {
		return false
	}

	// Where /proc is available (Linux), check whether the process is a zombie.
	// If the stat file cannot be read, fall back to the kill(0) verdict.
	b, readErr := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
	if readErr != nil {
		return true
	}

	// Format: "pid (comm) state ..." — state follows the closing ')' of the
	// command name (which may itself contain spaces and parens), hence the
	// search for the LAST ") ".
	rest := string(b)
	if idx := strings.LastIndex(rest, ") "); idx >= 0 {
		fields := strings.Fields(rest[idx+2:])
		if len(fields) > 0 && fields[0] == "Z" {
			return false // zombie: terminated but not yet reaped
		}
	}
	return true
}
|
||||||
|
|
||||||
|
// TestProcessKillerKillsTree verifies that a process group captured by the
|
||||||
|
// killer is terminated together with a child the step spawns afterwards. This
|
||||||
|
// mirrors a step that launches a child which spawns further processes, where
|
||||||
|
// cancelling the job must take down the whole tree, not just the direct child.
|
||||||
|
func TestProcessKillerKillsTree(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
pidFile := filepath.Join(dir, "child.pid")
|
||||||
|
|
||||||
|
// Parent shell backgrounds a long-lived child (writing its PID to a file)
|
||||||
|
// and then sleeps. With job control off (non-interactive sh) the backgrounded
|
||||||
|
// child stays in the parent's process group, so the group kill must reach it.
|
||||||
|
script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile)
|
||||||
|
cmd := exec.Command("/bin/sh", "-c", script)
|
||||||
|
// Launch as its own process-group leader, exactly like a real step does (see
|
||||||
|
// getSysProcAttr), so the killer's PGID == the process PID.
|
||||||
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
require.NoError(t, cmd.Start())
|
||||||
|
t.Cleanup(func() {
|
||||||
|
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||||
|
_ = cmd.Wait()
|
||||||
|
})
|
||||||
|
|
||||||
|
killer, err := newProcessKiller(cmd.Process)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer killer.Close()
|
||||||
|
|
||||||
|
// Wait for the backgrounded child PID to be reported.
|
||||||
|
var childPID int
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
b, e := os.ReadFile(pidFile)
|
||||||
|
if e != nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
s := strings.TrimSpace(string(b))
|
||||||
|
if s == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
childPID, _ = strconv.Atoi(s)
|
||||||
|
return childPID > 0 && processAlive(childPID)
|
||||||
|
}, 20*time.Second, 100*time.Millisecond, "child process should start")
|
||||||
|
|
||||||
|
// Killing the group must terminate both the parent and the backgrounded child.
|
||||||
|
require.NoError(t, killer.Kill())
|
||||||
|
// Reap the parent so it does not linger as a zombie (which would still report
|
||||||
|
// as alive); SIGKILL makes Wait return promptly.
|
||||||
|
_ = cmd.Wait()
|
||||||
|
|
||||||
|
require.Eventually(t, func() bool {
|
||||||
|
return !processAlive(childPID)
|
||||||
|
}, 20*time.Second, 100*time.Millisecond, "backgrounded child should be terminated")
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user