mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-06-10 02:54:23 +02:00
fix: deliver cancel ack and reap leftover Windows job processes (#996)
## Summary - When Gitea cancels a job, the reporter cancels its own task context; the final Close() flush then aborted on that same cancelled context and Gitea never received the runner's acknowledgement (missing tail logs and final state). - On Windows the cancelled context also neutralised terminateRunningProcesses, leaving step grandchildren alive in the workspace, holding file handles, so the runner could no longer clean up and pick up new work. - Reporter.Close() now flushes on a detached, bounded context via a new rpcCtx() helper and configurable Runner.ReportCloseTimeout (default 10s). - terminateRunningProcesses now PowerShell-enumerates Win32_Process and taskkill /T /F's every process whose ExecutablePath or CommandLine references the job's workspace directories, on a detached context. - The daemon heartbeat loop still exits on <-r.ctx.Done(): the runner is intentionally seen as offline by Gitea during cleanup so it isn't handed a new task overlapping the in-progress teardown. ## Test plan - [x] go test ./internal/pkg/report/... ./act/container/ -run 'TestReporter_ServerCancelStillFlushesFinal|TestBuildWindowsWorkspaceKillScript' - [x] make fmt && make lint-go - 0 issues - [x] GOOS=windows go build ./... - clean - [x] Manual on a Windows runner: trigger a long-running workflow, cancel from Gitea UI; verify (a) the job ends with tail logs + cancelled state in Gitea, (b) workspace cleans up, (c) the runner picks up a new job without restart. Authored-by: bircni 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: silverwind <me@silverwind.io> Reviewed-on: https://gitea.com/gitea/runner/pulls/996 Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
This commit is contained in:
@@ -16,9 +16,7 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
@@ -44,9 +42,6 @@ type HostEnvironment struct {
|
||||
CleanUp func()
|
||||
StdOut io.Writer
|
||||
AllocatePTY bool // allocate a pseudo-TTY for each step's process
|
||||
|
||||
mu sync.Mutex
|
||||
runningPIDs map[int]struct{}
|
||||
}
|
||||
|
||||
func (e *HostEnvironment) Create(_, _ []string) common.Executor {
|
||||
@@ -353,24 +348,9 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
|
||||
go copyPtyOutput(writer, ppty, finishLog)
|
||||
go writeKeepAlive(ppty)
|
||||
}
|
||||
// Split Start/Wait so the PID can be registered before the process can exit;
|
||||
// cmd.Run() would block until exit, by which time the PID may have been reused.
|
||||
if err := cmd.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cmd.Process != nil {
|
||||
e.mu.Lock()
|
||||
if e.runningPIDs == nil {
|
||||
e.runningPIDs = map[int]struct{}{}
|
||||
}
|
||||
e.runningPIDs[cmd.Process.Pid] = struct{}{}
|
||||
e.mu.Unlock()
|
||||
defer func(pid int) {
|
||||
e.mu.Lock()
|
||||
delete(e.runningPIDs, pid)
|
||||
e.mu.Unlock()
|
||||
}(cmd.Process.Pid)
|
||||
}
|
||||
err = cmd.Wait()
|
||||
if err != nil {
|
||||
var exitErr *exec.ExitError
|
||||
@@ -440,30 +420,80 @@ func removePathWithRetry(ctx context.Context, path string) error {
|
||||
return lastErr
|
||||
}
|
||||
|
||||
// buildWindowsWorkspaceKillScript builds a PowerShell command that `taskkill
|
||||
// /T /F`s every process tree whose ExecutablePath or CommandLine references one
|
||||
// of the given absolute workspace dirs, releasing file handles for cleanup.
|
||||
//
|
||||
// Win32_Process is used because it exposes both ExecutablePath and CommandLine
|
||||
// (Get-Process doesn't, wmic is deprecated). Both match the dir+separator
|
||||
// prefix, so a sibling dir sharing a name prefix (job1 vs job10) is spared.
|
||||
// Ordinal String methods, not -like, so path metacharacters ([ ] ? *) stay
|
||||
// literal.
|
||||
//
|
||||
// Pure function so the quote-escaping can be unit-tested without PowerShell.
|
||||
func buildWindowsWorkspaceKillScript(dirs []string) string {
|
||||
quoted := make([]string, len(dirs))
|
||||
for i, d := range dirs {
|
||||
// Single-quoted PowerShell literal; escape ' by doubling it.
|
||||
quoted[i] = "'" + strings.ReplaceAll(d, "'", "''") + "'"
|
||||
}
|
||||
|
||||
return `$paths = @(` + strings.Join(quoted, ",") + `)
|
||||
$selfPid = $PID
|
||||
Get-CimInstance Win32_Process -ErrorAction SilentlyContinue | Where-Object {
|
||||
if ($_.ProcessId -eq $selfPid) { return $false }
|
||||
foreach ($p in $paths) {
|
||||
$prefix = $p + '\'
|
||||
if ($_.ExecutablePath -and $_.ExecutablePath.StartsWith($prefix, [System.StringComparison]::OrdinalIgnoreCase)) { return $true }
|
||||
if ($_.CommandLine -and $_.CommandLine.IndexOf($prefix, [System.StringComparison]::OrdinalIgnoreCase) -ge 0) { return $true }
|
||||
}
|
||||
return $false
|
||||
} | ForEach-Object {
|
||||
& taskkill.exe /PID $_.ProcessId /T /F 2>$null | Out-Null
|
||||
}
|
||||
`
|
||||
}
|
||||
|
||||
func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) {
|
||||
if runtime.GOOS != "windows" {
|
||||
return
|
||||
}
|
||||
e.mu.Lock()
|
||||
pids := make([]int, 0, len(e.runningPIDs))
|
||||
for pid := range e.runningPIDs {
|
||||
pids = append(pids, pid)
|
||||
}
|
||||
e.mu.Unlock()
|
||||
|
||||
if len(pids) == 0 {
|
||||
// Detached: exec.CommandContext won't start on a cancelled ctx, and a
|
||||
// server cancel has already cancelled the parent ctx.
|
||||
killCtx, killCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer killCancel()
|
||||
|
||||
logger := common.Logger(ctx)
|
||||
|
||||
// Workspace dirs we own. Any process running from or referencing one is a
|
||||
// leftover job process. ToolCache is shared across jobs; Workdir only when
|
||||
// we own it (else it's a caller-provided checkout, e.g. act local mode).
|
||||
owned := []string{e.Path, e.TmpDir}
|
||||
if e.CleanWorkdir {
|
||||
owned = append(owned, e.Workdir)
|
||||
}
|
||||
dirs := make([]string, 0, len(owned))
|
||||
for _, d := range owned {
|
||||
if d == "" {
|
||||
continue
|
||||
}
|
||||
abs, err := filepath.Abs(d)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
dirs = append(dirs, abs)
|
||||
}
|
||||
if len(dirs) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
logger := common.Logger(ctx)
|
||||
for _, pid := range pids {
|
||||
// Best-effort: forcibly terminate process tree to release file handles
|
||||
// so that workspace cleanup can succeed on Windows.
|
||||
cmd := exec.CommandContext(ctx, "taskkill", "/PID", strconv.Itoa(pid), "/T", "/F")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
logger.Debugf("taskkill failed for pid=%d: %v output=%s", pid, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
script := buildWindowsWorkspaceKillScript(dirs)
|
||||
|
||||
cmd := exec.CommandContext(killCtx, "powershell.exe", "-NoProfile", "-NonInteractive", "-Command", script)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
logger.Debugf("workspace process-tree kill via PowerShell failed: %v output=%s", err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -477,14 +507,20 @@ func (e *HostEnvironment) Remove() common.Executor {
|
||||
if e.CleanUp != nil {
|
||||
e.CleanUp()
|
||||
}
|
||||
|
||||
// Detach: a cancelled ctx would skip removePathWithRetry's retries,
|
||||
// which absorb Windows file-handle release lag after the kill above.
|
||||
rmCtx, rmCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer rmCancel()
|
||||
|
||||
logger := common.Logger(ctx)
|
||||
var errs []error
|
||||
if err := removePathWithRetry(ctx, e.Path); err != nil {
|
||||
if err := removePathWithRetry(rmCtx, e.Path); err != nil {
|
||||
logger.Warnf("failed to remove host misc state %s: %v", e.Path, err)
|
||||
errs = append(errs, err)
|
||||
}
|
||||
if e.CleanWorkdir {
|
||||
if err := removePathWithRetry(ctx, e.Workdir); err != nil {
|
||||
if err := removePathWithRetry(rmCtx, e.Workdir); err != nil {
|
||||
logger.Warnf("failed to remove host workspace %s: %v", e.Workdir, err)
|
||||
errs = append(errs, err)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user