feat: Add optional runner.post_task_script hook after task cleanup (#1026)

- Adds `runner.post_task_script` and `runner.post_task_script_timeout` (default `5m`) to run a host executable after each task’s built-in cleanup (post-steps, container teardown, bind-workdir removal).
- Stops task heartbeats via `Reporter.StopHeartbeats()` while the script runs so Gitea won’t assign overlapping work; the final task acknowledgement still happens in `reporter.Close()`.
- Script output goes to the runner process log; non-zero exits are warned only and do not change the job result.
- Documents lifecycle, offline behavior, timeouts, and Windows limits (`.ps1` not supported yet) in `docs/post-task-script.md`.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1026
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
Nicolas
2026-06-19 19:28:10 +00:00
parent df0370f8bf
commit 007717956a
28 changed files with 922 additions and 263 deletions

View File

@@ -0,0 +1,132 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package run
import (
"context"
"errors"
"fmt"
"io"
"os/exec"
"strconv"
"strings"
"time"
"gitea.com/gitea/runner/act/common"
"gitea.com/gitea/runner/internal/pkg/config"
"gitea.com/gitea/runner/internal/pkg/metrics"
"gitea.com/gitea/runner/internal/pkg/process"
"gitea.com/gitea/runner/internal/pkg/report"
runnerv1 "gitea.dev/actions-proto-go/runner/v1"
log "github.com/sirupsen/logrus"
)
// runPostTaskScript runs the host executable configured as
// runner.post_task_script, if any, and waits for it to finish.
//
// The script receives the environment built by postTaskScriptEnv and its
// stdout/stderr are forwarded line by line to the runner process log. Every
// failure mode (cannot start, timeout, non-zero exit, I/O error) is logged as a
// warning only; the function returns nothing, so the hook cannot alter the
// result the caller reports for the task.
//
// ctx is used only as a carrier of values: the script runs under its own
// timeout, detached from the task's deadline and cancellation (see
// postTaskScriptContext).
func (r *Runner) runPostTaskScript(ctx context.Context, reporter *report.Reporter, task *runnerv1.Task, workdir string) {
	script := r.cfg.Runner.PostTaskScript
	if script == "" {
		return
	}

	// Re-apply the default here as well, so a config that was built in code
	// rather than loaded from a file still gets a bounded runtime.
	timeout := r.cfg.Runner.PostTaskScriptTimeout
	if timeout <= 0 {
		timeout = config.DefaultPostTaskScriptTimeout
	}
	scriptCtx, cancel := postTaskScriptContext(ctx, timeout)
	defer cancel()

	env := r.postTaskScriptEnv(reporter, task, workdir)
	log.Infof("running post-task script %q for task %d", script, task.Id)

	cmd := exec.CommandContext(scriptCtx, script)
	cmd.Env = envListFromMap(env)
	// NOTE(review): the Windows SysProcAttr forwards its first argument
	// verbatim as the raw command line, so a script path containing spaces may
	// need quoting there — confirm.
	cmd.SysProcAttr = process.SysProcAttr(script, false)
	stdout := postTaskScriptLogWriter("stdout")
	stderr := postTaskScriptLogWriter("stderr")
	cmd.Stdout = stdout
	cmd.Stderr = stderr

	// Kill the script's whole process tree on cancellation and bound the post-exit
	// I/O wait, so a backgrounded child inheriting cmd's stdout/stderr pipe can
	// never hang cmd.Wait() and the runner. See process.TreeKill.
	treeKill := process.NewTreeKill(cmd)
	if err := cmd.Start(); err != nil {
		log.Warnf("post-task script %q for task %d: %v", script, task.Id, err)
		return
	}
	if k, kerr := treeKill.Capture(cmd.Process); kerr != nil {
		log.Warnf("post-task script %q for task %d: process tree kill setup failed, falling back to single-process kill: %v", script, task.Id, kerr)
	} else {
		defer k.Close()
	}

	err := cmd.Wait()
	// Flush any trailing, not-yet-newline-terminated output now that cmd.Wait
	// has returned and the I/O copiers are done (process.TreeKill bounds that
	// wait, so this point is always reached).
	common.FlushWriter(stdout)
	common.FlushWriter(stderr)
	if err == nil {
		return
	}

	var exitErr *exec.ExitError
	switch {
	case scriptCtx.Err() != nil:
		// scriptCtx is detached from the task's cancellation, so at this point
		// it can only be done because the timeout elapsed. This must be checked
		// before the exit status: a killed process normally surfaces from
		// cmd.Wait as *exec.ExitError, which would otherwise hide the timeout
		// behind a bare "exited with code -1".
		log.Warnf("post-task script %q for task %d stopped after exceeding its %s timeout: %v", script, task.Id, timeout, err)
	case errors.As(err, &exitErr):
		log.Warnf("post-task script %q for task %d exited with code %d", script, task.Id, exitErr.ExitCode())
	default:
		log.Warnf("post-task script %q for task %d: %v", script, task.Id, err)
	}
}
// postTaskScriptContext derives the context the post-task script runs under:
// it keeps the values carried by ctx but none of its deadline or cancellation,
// and expires after timeout.
//
// The task is already over when the script starts. Inheriting the task's
// deadline would silently shorten the script's budget for jobs that ended close
// to their own timeout, and an already-cancelled task context would stop the
// script before it ran at all.
func postTaskScriptContext(ctx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	// WithoutCancel preserves the context values while dropping both the
	// deadline and the cancellation signal of the parent.
	detached := context.WithoutCancel(ctx)
	return context.WithTimeout(detached, timeout)
}
// postTaskScriptEnv builds the environment handed to the post-task script.
//
// It starts from a copy of the runner's environment (r.cloneEnvs) and adds:
//   - GITEA_TASK_ID: the numeric task ID.
//   - GITEA_WORKSPACE: workdir as passed by the caller.
//   - GITEA_JOB_RESULT: the reporter's result as a status label.
//   - GITEA_RUN_ID, GITEA_REPOSITORY: taken from the task context fields
//     "run_id" and "repository"; each is omitted when the field is missing,
//     empty or not a string.
func (r *Runner) postTaskScriptEnv(reporter *report.Reporter, task *runnerv1.Task, workdir string) map[string]string {
	env := r.cloneEnvs()
	env["GITEA_TASK_ID"] = strconv.FormatInt(task.Id, 10)
	env["GITEA_WORKSPACE"] = workdir
	// GITEA_JOB_RESULT shares the runner's canonical result vocabulary
	// (success/failure/cancelled/skipped/unknown), the same strings the reporter
	// parses and the metrics labels use.
	env["GITEA_JOB_RESULT"] = metrics.ResultToStatusLabel(reporter.Result())

	// Go through the generated getters: they tolerate a task without a context
	// struct, where reading task.Context.Fields directly would dereference nil.
	// Indexing the resulting (possibly nil) map and calling GetStringValue on a
	// missing entry are both safe and yield "".
	fields := task.GetContext().GetFields()
	if v := fields["run_id"].GetStringValue(); v != "" {
		env["GITEA_RUN_ID"] = v
	}
	if v := fields["repository"].GetStringValue(); v != "" {
		env["GITEA_REPOSITORY"] = v
	}
	return env
}
// envListFromMap flattens env into the "KEY=value" form expected by
// exec.Cmd.Env. The result is never nil and has one entry per map key; its
// order follows map iteration and is therefore unspecified.
func envListFromMap(env map[string]string) []string {
	pairs := make([]string, len(env))
	next := 0
	for name, value := range env {
		pairs[next] = fmt.Sprintf("%s=%s", name, value)
		next++
	}
	return pairs
}
// postTaskScriptLogWriter returns an io.Writer that logs the script's output one
// line at a time, tagged with the stream name. It is passed as cmd.Stdout/Stderr
// (rather than a StdoutPipe) so that cmd.WaitDelay governs the copying goroutine:
// a backgrounded process holding the pipe open can never block cmd.Wait()
// indefinitely. Flush any trailing partial line with common.FlushWriter after
// cmd.Wait() returns.
func postTaskScriptLogWriter(stream string) io.Writer {
return common.NewLineWriter(func(line string) bool {
log.Infof("post-task script %s: %s", stream, strings.TrimRight(line, "\r\n"))
return true
})
}

View File

@@ -0,0 +1,157 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package run
import (
"context"
"os"
"path/filepath"
"testing"
"time"
"gitea.com/gitea/runner/internal/pkg/config"
"gitea.com/gitea/runner/internal/pkg/metrics"
"gitea.com/gitea/runner/internal/pkg/report"
runnerv1 "gitea.dev/actions-proto-go/runner/v1"
log "github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/structpb"
)
func TestRunPostTaskScriptSkippedWhenEmpty(t *testing.T) {
r := &Runner{
cfg: &config.Config{},
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
taskCtx, err := structpb.NewStruct(map[string]any{})
require.NoError(t, err)
task := &runnerv1.Task{Id: 1, Context: taskCtx}
reporter := report.NewReporter(ctx, cancel, nil, task, r.cfg)
require.NotPanics(t, func() {
r.runPostTaskScript(ctx, reporter, task, "/workspace/owner/repo")
})
}
// TestRunPostTaskScriptNonZeroExitDoesNotPanic runs a script that exits with
// status 2 and checks that runPostTaskScript absorbs the failure.
func TestRunPostTaskScriptNonZeroExitDoesNotPanic(t *testing.T) {
	// The fixture is a "#!/bin/sh" script. Without that interpreter the script
	// cannot even start, and the test would pass without ever exercising the
	// non-zero-exit path it is named after.
	if _, err := os.Stat("/bin/sh"); err != nil {
		t.Skipf("requires /bin/sh: %v", err)
	}

	dir := t.TempDir()
	scriptPath := filepath.Join(dir, "fail.sh")
	require.NoError(t, os.WriteFile(scriptPath, []byte("#!/bin/sh\nexit 2\n"), 0o700))

	cfg, err := config.LoadDefault("")
	require.NoError(t, err)
	cfg.Runner.PostTaskScript = scriptPath
	r := &Runner{cfg: cfg}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	taskCtx, err := structpb.NewStruct(map[string]any{})
	require.NoError(t, err)
	task := &runnerv1.Task{Id: 1, Context: taskCtx}
	reporter := report.NewReporter(ctx, cancel, nil, task, cfg)

	require.NotPanics(t, func() {
		r.runPostTaskScript(ctx, reporter, task, "/workspace/owner/repo")
	})
}
// TestPostTaskScriptContextUsesFullTimeout checks that the script context is
// independent of the task context: neither a near task deadline nor a task
// cancellation may leak into it.
func TestPostTaskScriptContextUsesFullTimeout(t *testing.T) {
	const timeout = 5 * time.Minute

	// Case 1: the task context expires in one second, yet the script must
	// still be granted (well over a minute of) its configured timeout.
	expiring, stopExpiring := context.WithTimeout(context.Background(), time.Second)
	defer stopExpiring()
	fromExpiring, release := postTaskScriptContext(expiring, timeout)
	defer release()
	scriptDeadline, hasDeadline := fromExpiring.Deadline()
	require.True(t, hasDeadline)
	assert.Greater(t, time.Until(scriptDeadline), time.Minute, "script timeout truncated to task deadline")

	// Case 2: the task context is cancelled before the script context is
	// derived; the script context must nevertheless start out live.
	dead, kill := context.WithCancel(context.Background())
	kill()
	fromDead, releaseDead := postTaskScriptContext(dead, timeout)
	defer releaseDead()
	assert.NoError(t, fromDead.Err(), "script context inherited the cancelled task context")
}
// TestPostTaskScriptEnv checks that the script environment contains the
// runner's own variables plus every GITEA_* variable derived from the task,
// its context fields, the workspace path and the reported job result.
func TestPostTaskScriptEnv(t *testing.T) {
	cfg, err := config.LoadDefault("")
	require.NoError(t, err)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	fields := map[string]any{
		"run_id":     "99",
		"repository": "acme/widget",
	}
	taskCtx, err := structpb.NewStruct(fields)
	require.NoError(t, err)

	runner := &Runner{
		cfg:  cfg,
		envs: map[string]string{"BASE": "1"},
	}
	task := &runnerv1.Task{Id: 3, Context: taskCtx}
	reporter := report.NewReporter(ctx, cancel, nil, task, cfg)
	setReporterJobResult(t, reporter, runnerv1.Result_RESULT_FAILURE)

	env := runner.postTaskScriptEnv(reporter, task, "/tmp/workspace")

	// Kept as an ordered list so failures are reported in a stable order.
	expectations := []struct{ key, want string }{
		{"BASE", "1"},
		{"GITEA_TASK_ID", "3"},
		{"GITEA_RUN_ID", "99"},
		{"GITEA_REPOSITORY", "acme/widget"},
		{"GITEA_WORKSPACE", "/tmp/workspace"},
		{"GITEA_JOB_RESULT", "failure"},
	}
	for _, e := range expectations {
		assert.Equal(t, e.want, env[e.key])
	}
}
// TestRunPostTaskScriptIntegration runs a real shell script through
// runPostTaskScript and checks, via a file the script writes, that it saw the
// task ID, the job result and a variable from the runner's own environment.
func TestRunPostTaskScriptIntegration(t *testing.T) {
	// The fixture is a "#!/bin/sh" script; on hosts without that interpreter
	// (e.g. Windows) it cannot be executed and the output file would never
	// appear, so skip instead of failing for an unrelated reason.
	if _, err := os.Stat("/bin/sh"); err != nil {
		t.Skipf("requires /bin/sh: %v", err)
	}

	dir := t.TempDir()
	outFile := filepath.Join(dir, "out.txt")
	scriptPath := filepath.Join(dir, "post-task.sh")
	script := "#!/bin/sh\nprintf '%s %s %s' \"$GITEA_TASK_ID\" \"$GITEA_JOB_RESULT\" \"$CUSTOM\" > \"" + outFile + "\"\n"
	require.NoError(t, os.WriteFile(scriptPath, []byte(script), 0o700))

	cfg, err := config.LoadDefault("")
	require.NoError(t, err)
	cfg.Runner.PostTaskScript = scriptPath
	r := &Runner{
		cfg:  cfg,
		envs: map[string]string{"CUSTOM": "runner-env"},
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	taskCtx, err := structpb.NewStruct(map[string]any{})
	require.NoError(t, err)
	task := &runnerv1.Task{Id: 11, Context: taskCtx}
	reporter := report.NewReporter(ctx, cancel, nil, task, cfg)
	setReporterJobResult(t, reporter, runnerv1.Result_RESULT_SUCCESS)

	r.runPostTaskScript(ctx, reporter, task, "/workspace/acme/repo")

	content, err := os.ReadFile(outFile)
	require.NoError(t, err)
	assert.Equal(t, "11 success runner-env", string(content))
}
// setReporterJobResult makes reporter record result as the job result by
// firing the kind of log entry it parses: stage "Post" with a "jobResult"
// field holding the result's status label. The test fails if Fire errors.
func setReporterJobResult(t *testing.T, reporter *report.Reporter, result runnerv1.Result) {
	t.Helper()

	entry := &log.Entry{
		Time:    time.Now(),
		Message: "job finished",
		Data: log.Fields{
			"stage":     "Post",
			"jobResult": metrics.ResultToStatusLabel(result),
		},
	}
	require.NoError(t, reporter.Fire(entry))
}

View File

@@ -475,6 +475,9 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.
}
}
reporter.StopHeartbeats()
r.runPostTaskScript(ctx, reporter, task, workdir)
return execErr
}

View File

@@ -83,6 +83,21 @@ runner:
# terminal; tools like `docker build` emit redrawing progress frames into the captured log
# when a TTY is present.
allocate_pty: false
# Optional executable on the host, run once after each task's built-in cleanup
# (post-steps, container teardown, bind-workdir removal). Additive only.
#
# IMPORTANT: While this script runs the runner stops task heartbeats and stays
# offline from Gitea's perspective until the script exits. A script that never
# returns blocks new work until post_task_script_timeout kills it (default 5m).
# Keep scripts short; set post_task_script_timeout to a safe upper bound.
#
# Output -> runner process log (not the job log). Non-zero exit -> warning only.
# Windows: use .exe, .bat, or .cmd. PowerShell (.ps1) is not supported yet as
# the configured path; wrap PowerShell commands in a .cmd file instead.
# Full guide: docs/post-task-script.md
post_task_script: ''
# Hard limit on post_task_script runtime. Default if omitted: 5m.
post_task_script_timeout: 5m
cache:
# Enable the built-in cache server (used by actions/cache and similar actions).

View File

@@ -16,6 +16,12 @@ import (
"go.yaml.in/yaml/v4"
)
// DefaultPostTaskScriptTimeout is the fallback cap (5 minutes) on how long the
// post-task script may run. It is applied in two places: at config load, when
// post_task_script is set and post_task_script_timeout is missing or not
// positive, and again at the point of use, so a programmatically built config
// still gets a sane bound.
const DefaultPostTaskScriptTimeout = 5 * time.Minute
// Log represents the configuration for logging.
type Log struct {
Level string `yaml:"level"` // Level indicates the logging level.
@@ -23,26 +29,28 @@ type Log struct {
// Runner represents the configuration for the runner.
type Runner struct {
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle.
WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup.
IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush.
LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
StateReportInterval time.Duration `yaml:"state_report_interval"` // StateReportInterval specifies the interval for state reporting.
ReportCloseTimeout time.Duration `yaml:"report_close_timeout"` // ReportCloseTimeout caps each RPC attempt when flushing the final logs and task state at job completion, on a detached context so a server cancel can't block the acknowledgement.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github
AllocatePTY bool `yaml:"allocate_pty"` // AllocatePTY allocates a pseudo-TTY for each step's process. Default is false, matching GitHub's actions/runner. Enable only for jobs that need an interactive terminal; tools like docker build emit redrawing progress frames into the captured log when a TTY is present. Applies to both host and docker backends.
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle.
WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup.
IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush.
LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
StateReportInterval time.Duration `yaml:"state_report_interval"` // StateReportInterval specifies the interval for state reporting.
ReportCloseTimeout time.Duration `yaml:"report_close_timeout"` // ReportCloseTimeout caps each RPC attempt when flushing the final logs and task state at job completion, on a detached context so a server cancel can't block the acknowledgement.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github
AllocatePTY bool `yaml:"allocate_pty"` // AllocatePTY allocates a pseudo-TTY for each step's process. Default is false, matching GitHub's actions/runner. Enable only for jobs that need an interactive terminal; tools like docker build emit redrawing progress frames into the captured log when a TTY is present. Applies to both host and docker backends.
PostTaskScript string `yaml:"post_task_script"` // PostTaskScript is the path to an executable script run on the host after each task's cleanup completes. Empty disables the hook. On Windows use .exe/.bat/.cmd; PowerShell (.ps1) is not supported yet as the configured path.
PostTaskScriptTimeout time.Duration `yaml:"post_task_script_timeout"` // PostTaskScriptTimeout caps how long the post-task script may run. Default is 5m when post_task_script is set.
}
// Cache represents the configuration for caching.
@@ -193,6 +201,9 @@ func LoadDefault(file string) (*Config, error) {
if cfg.Runner.ReportCloseTimeout <= 0 {
cfg.Runner.ReportCloseTimeout = 10 * time.Second
}
if cfg.Runner.PostTaskScript != "" && cfg.Runner.PostTaskScriptTimeout <= 0 {
cfg.Runner.PostTaskScriptTimeout = DefaultPostTaskScriptTimeout
}
if cfg.Metrics.Addr == "" {
cfg.Metrics.Addr = "127.0.0.1:9101"
}

View File

@@ -107,6 +107,34 @@ runner:
// TestLoadDefault_LoadsPostTaskScript checks that post_task_script and an
// explicit post_task_script_timeout are read from the runner section as written.
func TestLoadDefault_LoadsPostTaskScript(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "config.yaml")
	require.NoError(t, os.WriteFile(path, []byte(`
runner:
  post_task_script: /usr/local/bin/post-task.sh
  post_task_script_timeout: 2m
`), 0o600))

	cfg, err := LoadDefault(path)
	require.NoError(t, err)
	assert.Equal(t, "/usr/local/bin/post-task.sh", cfg.Runner.PostTaskScript)
	assert.Equal(t, 2*time.Minute, cfg.Runner.PostTaskScriptTimeout)
}

// TestLoadDefault_DefaultsPostTaskScriptTimeout checks that a configured
// post_task_script without an explicit timeout gets the 5 minute default.
func TestLoadDefault_DefaultsPostTaskScriptTimeout(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "config.yaml")
	require.NoError(t, os.WriteFile(path, []byte(`
runner:
  post_task_script: /usr/local/bin/post-task.sh
`), 0o600))

	cfg, err := LoadDefault(path)
	require.NoError(t, err)
	assert.Equal(t, 5*time.Minute, cfg.Runner.PostTaskScriptTimeout)
}

// TestLoadDefault_MalformedYAMLReturnsParseError pins the error surfaced for
// invalid YAML to the canonical "parse config file" message rather than the
// "for defaults metadata" variant — i.e. the main yaml.Unmarshal runs first.
func TestLoadDefault_MalformedYAMLReturnsParseError(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "config.yaml")

View File

@@ -0,0 +1,29 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build plan9
package process
import "os"
// Killer is the fallback used on platforms that offer neither a process-group
// nor a Job Object based tree-kill (those live in killer_unix.go and
// killer_windows.go). It terminates only the direct child process, which
// matches the previous default behaviour.
type Killer struct {
	p *os.Process
}

// NewKiller remembers p as the process to terminate. It never fails; the error
// result exists to match the other platforms' signature.
func NewKiller(p *os.Process) (*Killer, error) {
	killer := &Killer{p: p}
	return killer, nil
}

// Kill terminates the captured process. It is a no-op on a nil Killer or when
// no process was captured.
func (k *Killer) Kill() error {
	if k != nil && k.p != nil {
		return k.p.Kill()
	}
	return nil
}

// Close is a no-op here; there is nothing to release.
func (k *Killer) Close() error {
	return nil
}

View File

@@ -0,0 +1,56 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import (
"errors"
"os"
"syscall"
)
// Killer tears down a started process together with its whole process group.
// It is the Unix counterpart of the Windows Job Object tree-kill.
//
// Why it exists: a process (a step or a post-task script) frequently starts a
// tree of its own, e.g. a shell launching a child that spawns background
// processes. The default exec.CommandContext cancellation kills only the direct
// child, so the rest of the tree kept running after a cancel; and because those
// orphans inherited the parent's stdout/stderr pipe, cmd.Wait() blocked forever
// and the runner hung.
//
// How it works: processes are started with Setpgid (or Setsid on the PTY path,
// see SysProcAttr), making the process the leader of a new group whose ID is
// its own PID. A signal sent to the negative PID reaches every process still in
// that group, so the tree can be torn down in one call; that also closes the
// inherited pipe handles, which lets cmd.Wait() return.
type Killer struct {
	pgid int
}

// NewKiller records the process group of p, which must already be started.
// Since p was launched with Setpgid/Setsid it is a group leader, so its PGID is
// simply its PID; children it spawns later remain in that group unless they
// deliberately create their own. It never fails.
func NewKiller(p *os.Process) (*Killer, error) {
	killer := &Killer{pgid: p.Pid}
	return killer, nil
}

// Kill delivers SIGKILL to the whole process group, i.e. the process and each
// descendant that stayed in the group. ESRCH (no such group) only means
// everything has already exited, so it is reported as success. Kill is a no-op
// on a nil Killer or when no valid group ID was captured.
func (k *Killer) Kill() error {
	if k == nil || k.pgid <= 0 {
		return nil
	}
	// A negative target addresses the process group rather than one process.
	err := syscall.Kill(-k.pgid, syscall.SIGKILL)
	if err == nil || errors.Is(err, syscall.ESRCH) {
		return nil
	}
	return err
}

// Close does nothing on Unix: no job handle exists that would need releasing.
func (k *Killer) Close() error {
	return nil
}

View File

@@ -0,0 +1,101 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"testing"
"time"
"github.com/stretchr/testify/require"
)
// processAlive reports whether pid refers to a still-running process. Signal 0
// performs error checking without delivering a signal: a nil error or EPERM
// (the process exists but we may not signal it) means the process exists, any
// other error (notably ESRCH) means it is gone. Non-positive pids are never
// alive: kill(2) would interpret them as process groups, not as a process.
//
// On Linux, zombie processes (state Z in /proc/<pid>/stat) appear alive to
// kill(0) but have already terminated — their corpse lingers until the parent
// calls wait(). In a Docker container the child may be reparented to a PID 1
// that does not reap promptly, so we treat zombies as not alive.
func processAlive(pid int) bool {
	if pid <= 0 {
		return false
	}
	// syscall.Kill returns a bare syscall.Errno, so a direct comparison is enough.
	if err := syscall.Kill(pid, 0); err != nil && err != syscall.EPERM {
		return false
	}
	// Where /proc is available (Linux), check whether the process is a zombie;
	// elsewhere the read fails and the kill(0) verdict stands.
	stat, err := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
	if err != nil {
		return true
	}
	// Format: "pid (comm) state ..." — state follows the closing ')' of the
	// command name (which may itself contain spaces and parens).
	rest := string(stat)
	if idx := strings.LastIndex(rest, ") "); idx >= 0 {
		fields := strings.Fields(rest[idx+2:])
		if len(fields) > 0 && fields[0] == "Z" {
			return false // zombie: terminated but not yet reaped
		}
	}
	return true
}
// TestKillerKillsTree verifies that a process group captured by the killer is
// terminated together with a child the process spawns afterwards. This mirrors
// a step or post-task script that launches a child which spawns further
// processes, where cancelling must take down the whole tree, not just the
// direct child.
func TestKillerKillsTree(t *testing.T) {
	dir := t.TempDir()
	pidFile := filepath.Join(dir, "child.pid")
	// Parent shell backgrounds a long-lived child (writing its PID to a file)
	// and then sleeps. With job control off (non-interactive sh) the backgrounded
	// child stays in the parent's process group, so the group kill must reach it.
	script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile)
	cmd := exec.Command("/bin/sh", "-c", script)
	// Launch as its own process-group leader, exactly like a real process does
	// (see SysProcAttr), so the killer's PGID == the process PID.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	require.NoError(t, cmd.Start())
	// Safety net: if an assertion below fails before the group was killed, kill
	// and reap it here so no sleep process outlives the test. Both errors are
	// ignored on purpose; on the happy path the group is already gone and the
	// parent already reaped.
	t.Cleanup(func() {
		_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
		_ = cmd.Wait()
	})
	killer, err := NewKiller(cmd.Process)
	require.NoError(t, err)
	defer killer.Close()
	// Wait for the backgrounded child PID to be reported. The file may exist
	// but still be empty while the shell is writing it, hence the extra checks.
	var childPID int
	require.Eventually(t, func() bool {
		b, e := os.ReadFile(pidFile)
		if e != nil {
			return false
		}
		s := strings.TrimSpace(string(b))
		if s == "" {
			return false
		}
		childPID, _ = strconv.Atoi(s)
		return childPID > 0 && processAlive(childPID)
	}, 20*time.Second, 100*time.Millisecond, "child process should start")
	// Killing the group must terminate both the parent and the backgrounded child.
	require.NoError(t, killer.Kill())
	// Reap the parent so it does not linger as a zombie (which would still report
	// as alive); SIGKILL makes Wait return promptly. Its error (the process was
	// killed) is expected and ignored.
	_ = cmd.Wait()
	require.Eventually(t, func() bool {
		return !processAlive(childPID)
	}, 20*time.Second, 100*time.Millisecond, "backgrounded child should be terminated")
}

View File

@@ -0,0 +1,72 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"os"
"golang.org/x/sys/windows"
)
// Killer tears down a started process and its entire descendant tree by means
// of a Windows Job Object.
//
// Why it exists: a process (a step or a post-task script) frequently starts a
// tree of its own, e.g. a shell launching a child that spawns further GUI or
// background processes. The default exec.CommandContext cancellation kills only
// the direct child, so the rest of the tree kept running after a cancel; and
// because those orphans inherited the parent's stdout/stderr pipe, cmd.Wait()
// blocked forever and the runner hung.
//
// How it works: the process is assigned to a Job Object, and TerminateJobObject
// kills the whole tree in one call on cancellation. That also closes the
// inherited pipe handles, which lets cmd.Wait() return.
type Killer struct {
	job windows.Handle
}

// NewKiller creates a Job Object and assigns p, which must already be started,
// to it. Children that p spawns afterwards automatically belong to the job.
//
// JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE is deliberately NOT set: closing the handle
// after a normal completion must not kill legitimate background processes. The
// tree is only torn down by an explicit Kill (cancellation).
//
// On any failure the job handle is released again and the error is returned.
func NewKiller(p *os.Process) (*Killer, error) {
	job, err := windows.CreateJobObject(nil, nil)
	if err != nil {
		return nil, err
	}
	// Release the job again unless it ends up owned by the returned Killer.
	assigned := false
	defer func() {
		if !assigned {
			windows.CloseHandle(job)
		}
	}()

	// The process handle is needed only for the assignment itself.
	proc, err := windows.OpenProcess(windows.PROCESS_SET_QUOTA|windows.PROCESS_TERMINATE, false, uint32(p.Pid))
	if err != nil {
		return nil, err
	}
	defer windows.CloseHandle(proc)

	if err := windows.AssignProcessToJobObject(job, proc); err != nil {
		return nil, err
	}
	assigned = true
	return &Killer{job: job}, nil
}

// Kill terminates every process currently assigned to the job, i.e. the
// process and all of its descendants, with exit code 1. It is a no-op on a nil
// Killer or after Close.
func (k *Killer) Kill() error {
	if k != nil && k.job != 0 {
		return windows.TerminateJobObject(k.job, 1)
	}
	return nil
}

// Close releases the job handle without terminating the processes. The handle
// is cleared first, so repeated calls (and calls on a nil Killer) are no-ops.
func (k *Killer) Close() error {
	if k == nil || k.job == 0 {
		return nil
	}
	job := k.job
	k.job = 0
	return windows.CloseHandle(job)
}

View File

@@ -0,0 +1,78 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"golang.org/x/sys/windows"
)
// processAlive reports whether pid refers to a still-running process: the
// process must be openable for limited queries and its exit code must still be
// STILL_ACTIVE. Any failure along the way counts as "not alive".
func processAlive(pid int) bool {
	const stillActive = 259 // STILL_ACTIVE

	handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
	if err != nil {
		return false
	}
	defer windows.CloseHandle(handle)

	var exitCode uint32
	if windows.GetExitCodeProcess(handle, &exitCode) != nil {
		return false
	}
	return exitCode == stillActive
}
// TestKillerKillsTree verifies that a process assigned to the Job Object is
// terminated together with a child it spawns afterwards. This mirrors a step or
// post-task script that launches a child which spawns further processes, where
// cancelling must take down the whole tree, not just the direct child.
func TestKillerKillsTree(t *testing.T) {
	dir := t.TempDir()
	pidFile := filepath.Join(dir, "child.pid")
	// Parent powershell spawns a detached, long-lived child powershell (writing
	// its PID to a file) and then sleeps. The child is launched AFTER the parent
	// has been assigned to the job, so it must be captured by the job too.
	script := fmt.Sprintf(
		`$c = Start-Process powershell -PassThru -ArgumentList '-NoProfile','-Command','Start-Sleep -Seconds 600'; `+
			`Set-Content -LiteralPath %q -Value $c.Id; Start-Sleep -Seconds 600`, pidFile)
	cmd := exec.Command("powershell.exe", "-NoProfile", "-Command", script)
	require.NoError(t, cmd.Start())
	// Safety net for early failures. Note that this kills only the direct
	// parent process; the error is ignored because the parent is normally
	// already gone by the time cleanup runs.
	t.Cleanup(func() { _ = cmd.Process.Kill() })
	killer, err := NewKiller(cmd.Process)
	require.NoError(t, err)
	defer killer.Close()
	// Wait for the child PID to be reported. The file may exist but still be
	// empty while it is being written, hence the extra checks.
	var childPID int
	require.Eventually(t, func() bool {
		b, e := os.ReadFile(pidFile)
		if e != nil {
			return false
		}
		s := strings.TrimSpace(string(b))
		if s == "" {
			return false
		}
		childPID, _ = strconv.Atoi(s)
		return childPID > 0 && processAlive(childPID)
	}, 20*time.Second, 200*time.Millisecond, "child process should start")
	// Killing the job must terminate both the parent and the detached child.
	require.NoError(t, killer.Kill())
	require.Eventually(t, func() bool {
		return !processAlive(cmd.Process.Pid) && !processAlive(childPID)
	}, 20*time.Second, 200*time.Millisecond, "parent and child should both be terminated")
}

View File

@@ -0,0 +1,17 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build plan9
package process
import "syscall"
// SysProcAttr returns the platform attributes used to start a process. Plan 9
// has no process-group tree-kill (see Killer), so we only request a new rfork
// note group here.
func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr {
return &syscall.SysProcAttr{
Rfork: syscall.RFNOTEG,
}
}

View File

@@ -0,0 +1,24 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows && !plan9
package process
import "syscall"
// SysProcAttr returns the platform attributes a process must be started with
// so that a Killer can later tear down its whole process tree.
//
// With tty set, the process becomes the leader of a new session and takes the
// controlling terminal (Setsid + Setctty); otherwise it becomes the leader of a
// new process group (Setpgid). In both cases a signal to the negative PID
// reaches every descendant that stayed in the group. The first parameter is
// unused on Unix.
func SysProcAttr(_ string, tty bool) *syscall.SysProcAttr {
	attr := &syscall.SysProcAttr{}
	if tty {
		attr.Setsid = true
		attr.Setctty = true
	} else {
		attr.Setpgid = true
	}
	return attr
}

View File

@@ -0,0 +1,14 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import "syscall"
// SysProcAttr returns the platform attributes used to start a process so that a
// Killer can later tear down its whole process tree. On Windows the process is
// placed in a new process group; the descendant tree is reclaimed via the Job
// Object set up by NewKiller.
func SysProcAttr(cmdLine string, tty bool) *syscall.SysProcAttr {
return &syscall.SysProcAttr{CmdLine: cmdLine, CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP}
}

View File

@@ -0,0 +1,66 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package process
import (
"os"
"os/exec"
"sync/atomic"
"time"
)
// treeKillWaitDelay is the WaitDelay installed on commands by NewTreeKill: the
// upper bound on how long Wait keeps waiting for the command's I/O pipes to
// drain once the process has exited, after which the pipes are force-closed and
// Wait returns. The same bound applies when a command exits cleanly but left a
// backgrounded process holding a pipe open.
const treeKillWaitDelay time.Duration = 10 * time.Second
// TreeKill wires an exec.Cmd so that cancelling it tears down the command's
// whole process tree (see Killer) rather than only the direct child, and bounds
// the post-exit I/O wait so a leftover pipe writer can never hang cmd.Wait.
//
// Background: a command often launches a process tree (a shell that starts a
// child which spawns further background processes). The default
// exec.CommandContext cancellation only kills the direct child, leaving the rest
// of the tree running; and because the orphans inherit cmd's stdout/stderr pipe,
// cmd.Wait() would block forever, hanging the caller.
//
// Usage: create it with NewTreeKill before cmd.Start, then call Capture once
// after a successful Start.
//
// Callers still set cmd.SysProcAttr (via SysProcAttr) themselves, because the
// value differs between the plain and PTY execution paths.
type TreeKill struct {
// killer is nil until Capture stores a Killer; the cmd.Cancel hook installed
// by NewTreeKill loads it atomically and falls back to killing only the
// direct process while it is still nil.
killer atomic.Pointer[Killer]
}
// NewTreeKill installs a tree-aware cmd.Cancel hook and a bounded cmd.WaitDelay
// on cmd, and returns the TreeKill that the hook consults. It must be called
// before cmd.Start; call Capture once after a successful Start.
//
// When cancellation fires, the hook prefers the captured Killer (whole tree).
// If nothing has been captured it kills just the direct process, and if the
// process was never started it does nothing.
func NewTreeKill(cmd *exec.Cmd) *TreeKill {
	tk := new(TreeKill)
	cmd.WaitDelay = treeKillWaitDelay
	cmd.Cancel = func() error {
		switch killer := tk.killer.Load(); {
		case killer != nil:
			return killer.Kill()
		case cmd.Process != nil:
			return cmd.Process.Kill()
		default:
			return nil
		}
	}
	return tk
}
// Capture hands the started process (and the descendants it spawns) to a Killer
// so that cancellation reaches the whole tree: a Job Object on Windows, where
// children spawned afterwards are included automatically, and the process group
// on Unix. Call it exactly once, after cmd.Start.
//
// If the Killer cannot be created, nothing is stored: the command keeps the
// default single-process kill, WaitDelay still bounds the wait, and the
// returned error is meant for logging only. On success the returned Killer
// should be closed when the command finishes (Close is nil-safe).
func (t *TreeKill) Capture(p *os.Process) (*Killer, error) {
	killer, err := NewKiller(p)
	if err == nil {
		t.killer.Store(killer)
		return killer, nil
	}
	return nil, err
}

View File

@@ -44,11 +44,13 @@ type Reporter struct {
// so the gauge skips no-op Set calls when the buffer size is unchanged.
lastLogBufferRows int
state *runnerv1.TaskState
stateChanged bool
stateMu sync.RWMutex
outputs sync.Map
daemon chan struct{}
state *runnerv1.TaskState
stateChanged bool
stateMu sync.RWMutex
outputs sync.Map
daemon chan struct{}
heartbeatStop chan struct{}
heartbeatStopOnce sync.Once
// Unix-nanos of the last successful UpdateTask. Atomic so the heartbeat
// guard in ReportState reads it without contending stateMu.
@@ -99,7 +101,8 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C
state: &runnerv1.TaskState{
Id: task.Id,
},
daemon: make(chan struct{}),
daemon: make(chan struct{}),
heartbeatStop: make(chan struct{}),
}
if task.Secrets["ACTIONS_STEP_DEBUG"] == "true" {
@@ -273,6 +276,15 @@ func (r *Reporter) RunDaemon() {
go r.runDaemonLoop()
}
// StopHeartbeats ends the periodic UpdateTask heartbeats while leaving the task
// context untouched; the final flush is still delivered by Close(). The stop
// channel is closed under a sync.Once, so repeated calls are harmless, and the
// call is also safe after the context has already been cancelled.
func (r *Reporter) StopHeartbeats() {
	signalStop := func() { close(r.heartbeatStop) }
	r.heartbeatStopOnce.Do(signalStop)
}
func (r *Reporter) stopLatencyTimer(active *bool, timer *time.Timer) {
if *active {
if !timer.Stop() {
@@ -339,6 +351,12 @@ func (r *Reporter) runDaemonLoop() {
// delivers the final flush on a detached context (flushFinal).
close(r.daemon)
return
case <-r.heartbeatStop:
// Stop heartbeating during post-task script execution. Close() still
// delivers the final flush on a detached context (flushFinal).
close(r.daemon)
return
}
r.stateMu.RLock()

View File

@@ -921,3 +921,65 @@ func TestReporter_CloseReportsCancelledOnCanceledCtx(t *testing.T) {
assert.True(t, foundCancelled, "final log must contain a 'Cancelled' row")
assert.False(t, foundEarlyTermination, "final log must not contain 'Early termination' on the cancel path")
}
// TestReporter_StopHeartbeats verifies that StopHeartbeats ends periodic
// UpdateTask heartbeats while Close() still flushes the final state.
func TestReporter_StopHeartbeats(t *testing.T) {
var updateTaskCalls atomic.Int64
client := mocks.NewClient(t)
client.On("UpdateLog", mock.Anything, mock.Anything).Maybe().Return(
func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
}), nil
},
)
client.On("UpdateTask", mock.Anything, mock.Anything).Return(
func(_ context.Context, _ *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
updateTaskCalls.Add(1)
return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
},
)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
taskCtx, err := structpb.NewStruct(map[string]any{})
require.NoError(t, err)
cfg, err := config.LoadDefault("")
require.NoError(t, err)
cfg.Runner.StateReportInterval = 20 * time.Millisecond
cfg.Runner.LogReportInterval = time.Hour
reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
reporter.ResetSteps(1)
reporter.RunDaemon()
reporter.stateMu.Lock()
reporter.stateChanged = true
reporter.state.Result = runnerv1.Result_RESULT_SUCCESS
reporter.state.StoppedAt = timestamppb.Now()
reporter.stateMu.Unlock()
require.Eventually(t, func() bool {
return updateTaskCalls.Load() >= 1
}, time.Second, 5*time.Millisecond, "daemon must send at least one UpdateTask before StopHeartbeats")
beforeStop := updateTaskCalls.Load()
reporter.StopHeartbeats()
select {
case <-reporter.daemon:
case <-time.After(time.Second):
t.Fatal("StopHeartbeats must stop the daemon loop")
}
time.Sleep(3 * cfg.Runner.StateReportInterval)
assert.Equal(t, beforeStop, updateTaskCalls.Load(),
"UpdateTask must not be called after StopHeartbeats")
require.NoError(t, reporter.Close(""))
assert.Greater(t, updateTaskCalls.Load(), beforeStop,
"Close() must still send a final UpdateTask after StopHeartbeats")
}