fix: deliver cancel ack and reap leftover Windows job processes (#996)

## Summary - When Gitea cancels a job, the reporter cancels its own task context; the final Close() flush then aborted on that same cancelled context and Gitea never received the runner's acknowledgement (missing tail logs and final state). - On Windows the cancelled context also neutralised terminateRunningProcesses, leaving step grandchildren alive in the workspace, holding file handles, so the runner could no longer clean up and pick up new work. - Reporter.Close() now flushes on a detached, bounded context via a new rpcCtx() helper and configurable Runner.ReportCloseTimeout (default 10s). - terminateRunningProcesses now PowerShell-enumerates Win32_Process and taskkill /T /F's every process whose ExecutablePath or CommandLine references the job's workspace directories, on a detached context. - The daemon heartbeat loop still exits on <-r.ctx.Done(): the runner is intentionally seen as offline by Gitea during cleanup so it isn't handed a new task overlapping the in-progress teardown. ## Test plan - [x] go test ./internal/pkg/report/... ./act/container/ -run 'TestReporter_ServerCancelStillFlushesFinal|TestBuildWindowsWorkspaceKillScript' - [x] make fmt && make lint-go - 0 issues - [x] GOOS=windows go build ./... - clean - [x] Manual on a Windows runner: trigger a long-running workflow, cancel from Gitea UI; verify (a) the job ends with tail logs + cancelled state in Gitea, (b) workspace cleans up, (c) the runner picks up a new job without restart. Authored-by: bircni 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: silverwind <me@silverwind.io> Reviewed-on: https://gitea.com/gitea/runner/pulls/996 Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
2026-06-10 11:44:30 +02:00 · 2026-05-24 10:01:01 +00:00
parent 273f6b4247
commit 0b9f251b6a
6 changed files with 273 additions and 47 deletions
--- a/internal/pkg/config/config.example.yaml
+++ b/internal/pkg/config/config.example.yaml
@@ -60,6 +60,9 @@ runner:
  # The interval for reporting task state (step status, timing) to the Gitea instance.
  # State is also reported immediately on step transitions (start/stop).
  state_report_interval: 5s
+  # Per-attempt deadline for flushing the final logs and task state when a job
+  # finishes, on a detached context so a server cancel can't block the acknowledgement.
+  report_close_timeout: 10s
  # The github_mirror of a runner is used to specify the mirror address of the github that pulls the action repository.
  # It works when something like `uses: actions/checkout@v4` is used and DEFAULT_ACTIONS_URL is set to github,
  # and github_mirror is not empty. In this case,
--- a/internal/pkg/config/config.go
+++ b/internal/pkg/config/config.go
@@ -39,6 +39,7 @@ type Runner struct {
 	LogReportMaxLatency time.Duration     `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
 	LogReportBatchSize  int               `yaml:"log_report_batch_size"`  // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
 	StateReportInterval time.Duration     `yaml:"state_report_interval"`  // StateReportInterval specifies the interval for state reporting.
+	ReportCloseTimeout  time.Duration     `yaml:"report_close_timeout"`   // ReportCloseTimeout caps each RPC attempt when flushing the final logs and task state at job completion, on a detached context so a server cancel can't block the acknowledgement.
 	Labels              []string          `yaml:"labels"`                 // Labels specify the labels of the runner. Labels are declared on each startup
 	GithubMirror        string            `yaml:"github_mirror"`          // GithubMirror defines what mirrors should be used when using github
 	AllocatePTY         bool              `yaml:"allocate_pty"`           // AllocatePTY allocates a pseudo-TTY for each step's process. Default is false, matching GitHub's actions/runner. Enable only for jobs that need an interactive terminal; tools like docker build emit redrawing progress frames into the captured log when a TTY is present. Applies to both host and docker backends.
@@ -183,6 +184,9 @@ func LoadDefault(file string) (*Config, error) {
 	if cfg.Runner.StateReportInterval <= 0 {
 		cfg.Runner.StateReportInterval = 5 * time.Second
 	}
+	if cfg.Runner.ReportCloseTimeout <= 0 {
+		cfg.Runner.ReportCloseTimeout = 10 * time.Second
+	}
 	if cfg.Metrics.Addr == "" {
 		cfg.Metrics.Addr = "127.0.0.1:9101"
 	}
--- a/internal/pkg/report/reporter.go
+++ b/internal/pkg/report/reporter.go
@@ -58,6 +58,9 @@ type Reporter struct {
 	logReportMaxLatency time.Duration
 	logBatchSize        int
 	stateReportInterval time.Duration
+	// closeTimeout bounds each RPC attempt in the final flush, on a context
+	// detached from r.ctx so a server cancel can't abort the acknowledgement.
+	closeTimeout time.Duration

 	// Event notification channels (non-blocking, buffered 1)
 	logNotify   chan struct{} // signal: new log rows arrived
@@ -89,6 +92,7 @@ func NewReporter(ctx context.Context, cancel context.CancelFunc, client client.C
 		logReportMaxLatency: cfg.Runner.LogReportMaxLatency,
 		logBatchSize:        cfg.Runner.LogReportBatchSize,
 		stateReportInterval: cfg.Runner.StateReportInterval,
+		closeTimeout:        cfg.Runner.ReportCloseTimeout,
 		logNotify:           make(chan struct{}, 1),
 		stateNotify:         make(chan struct{}, 1),
 		state: &runnerv1.TaskState{
@@ -329,6 +333,9 @@ func (r *Reporter) runDaemonLoop() {
 			_ = r.ReportLog(false)

 		case <-r.ctx.Done():
+			// Stop heartbeating on cancel so Gitea sees the runner as offline
+			// during cleanup and won't assign an overlapping task. Close() still
+			// delivers the final flush on a detached context (flushFinal).
 			close(r.daemon)
 			return
 		}
@@ -431,17 +438,43 @@ func (r *Reporter) Close(lastWords string) error {
 	}
 	r.stateMu.Unlock()

-	// Report the job outcome even when all log upload retry attempts have been exhausted
+	// Separate budgets so a slow ReportLog can't starve the ReportState that
+	// carries the cancel acknowledgement.
 	return errors.Join(
-		retry.New(retry.Context(r.ctx)).Do(func() error {
-			return r.ReportLog(true)
-		}),
-		retry.New(retry.Context(r.ctx)).Do(func() error {
-			return r.ReportState(true)
-		}),
+		r.flushFinal(func() error { return r.ReportLog(true) }),
+		r.flushFinal(func() error { return r.ReportState(true) }),
 	)
 }

+// flushFinal retries fn on a detached, bounded context so a cancelled r.ctx
+// does not abort the final flush. Each call gets its own fresh budget.
+func (r *Reporter) flushFinal(fn func() error) error {
+	ctx, cancel := context.WithTimeout(context.Background(), 3*r.effectiveCloseTimeout())
+	defer cancel()
+	return retry.New(retry.Context(ctx)).Do(fn)
+}
+
+// effectiveCloseTimeout returns closeTimeout, or 10s when unset, so a zero
+// value can't produce an already-expired context for the final flush.
+func (r *Reporter) effectiveCloseTimeout() time.Duration {
+	if r.closeTimeout <= 0 {
+		return 10 * time.Second
+	}
+	return r.closeTimeout
+}
+
+// rpcCtx returns the context for an outbound RPC plus a cancel func. While
+// r.ctx is alive it's used directly; once cancelled (server RESULT_CANCELLED),
+// RPCs switch to a fresh bounded context so Close()'s final flush still lands.
+func (r *Reporter) rpcCtx() (context.Context, context.CancelFunc) {
+	select {
+	case <-r.ctx.Done():
+		return context.WithTimeout(context.Background(), r.effectiveCloseTimeout())
+	default:
+		return r.ctx, func() {}
+	}
+}
+
 func (r *Reporter) ReportLog(noMore bool) error {
 	r.clientM.Lock()
 	defer r.clientM.Unlock()
@@ -454,8 +487,11 @@ func (r *Reporter) ReportLog(noMore bool) error {
 		return nil
 	}

+	rpcCtx, rpcCancel := r.rpcCtx()
+	defer rpcCancel()
+
 	start := time.Now()
-	resp, err := r.client.UpdateLog(r.ctx, connect.NewRequest(&runnerv1.UpdateLogRequest{
+	resp, err := r.client.UpdateLog(rpcCtx, connect.NewRequest(&runnerv1.UpdateLogRequest{
 		TaskId: r.state.Id,
 		Index:  int64(r.logOffset),
 		Rows:   rows,
@@ -526,8 +562,11 @@ func (r *Reporter) ReportState(reportResult bool) error {
 		state.Result = runnerv1.Result_RESULT_UNSPECIFIED
 	}

+	rpcCtx, rpcCancel := r.rpcCtx()
+	defer rpcCancel()
+
 	start := time.Now()
-	resp, err := r.client.UpdateTask(r.ctx, connect.NewRequest(&runnerv1.UpdateTaskRequest{
+	resp, err := r.client.UpdateTask(rpcCtx, connect.NewRequest(&runnerv1.UpdateTaskRequest{
 		State:   state,
 		Outputs: outputs,
 	}))
--- a/internal/pkg/report/reporter_test.go
+++ b/internal/pkg/report/reporter_test.go
@@ -754,3 +754,86 @@ func TestReporter_StateHeartbeat(t *testing.T) {
 	require.NoError(t, reporter.ReportState(false))
 	assert.Equal(t, int64(2), updateTaskCalls.Load(), "ReportState must heartbeat after stateReportInterval even with no state change")
 }
+
+// TestReporter_ServerCancelStillFlushesFinal asserts that when the Gitea server
+// returns RESULT_CANCELLED on an in-flight UpdateTask (which causes the
+// reporter to cancel the task context), Close() still successfully sends the
+// final UpdateLog{NoMore:true} and the final UpdateTask carrying the populated
+// final state. Before the fix this final flush used r.ctx, which was just
+// cancelled, so retry-go aborted on its context check and Gitea never received
+// the runner's acknowledgement of the cancel.
+func TestReporter_ServerCancelStillFlushesFinal(t *testing.T) {
+	var (
+		updateTaskCalls    atomic.Int64
+		finalLogNoMoreSeen atomic.Bool
+		finalTaskStateSeen atomic.Bool
+	)
+
+	client := mocks.NewClient(t)
+	client.On("UpdateLog", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
+			if req.Msg.NoMore {
+				finalLogNoMoreSeen.Store(true)
+			}
+			return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
+				AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
+			}), nil
+		},
+	)
+	// The first UpdateTask returns RESULT_CANCELLED — modelling a server-side
+	// cancellation; the reporter must call r.cancel() in response. The final
+	// UpdateTask issued by Close() must still arrive even though r.ctx is now
+	// cancelled.
+	client.On("UpdateTask", mock.Anything, mock.Anything).Return(
+		func(_ context.Context, req *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
+			n := updateTaskCalls.Add(1)
+			if n == 1 {
+				return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{
+					State: &runnerv1.TaskState{
+						Result: runnerv1.Result_RESULT_CANCELLED,
+					},
+				}), nil
+			}
+			if req.Msg.State != nil && req.Msg.State.Result != runnerv1.Result_RESULT_UNSPECIFIED {
+				finalTaskStateSeen.Store(true)
+			}
+			return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
+		},
+	)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	taskCtx, err := structpb.NewStruct(map[string]any{})
+	require.NoError(t, err)
+	cfg, _ := config.LoadDefault("")
+	reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx}, cfg)
+	reporter.ResetSteps(1)
+
+	// Force the first ReportState to actually call UpdateTask.
+	reporter.stateMu.Lock()
+	reporter.stateChanged = true
+	reporter.stateMu.Unlock()
+
+	// First ReportState — server returns RESULT_CANCELLED, reporter cancels r.ctx.
+	require.NoError(t, reporter.ReportState(false))
+	require.Equal(t, int64(1), updateTaskCalls.Load())
+
+	select {
+	case <-ctx.Done():
+		// Expected: reporter called cancel() because the server reported the task as cancelled.
+	case <-time.After(time.Second):
+		t.Fatal("expected r.ctx to be cancelled after server returned RESULT_CANCELLED")
+	}
+
+	// The test does not start the daemon goroutine; close(r.daemon) so Close()
+	// proceeds without waiting on its 60s timeout.
+	close(reporter.daemon)
+
+	// Now Close() runs. Before the fix, both final RPCs aborted on the cancelled
+	// r.ctx via retry.Context. After the fix, Close() uses a detached context and
+	// the per-RPC rpcCtx() falls back to a fresh ctx, so both calls succeed.
+	require.NoError(t, reporter.Close("cancelled"))
+
+	assert.True(t, finalLogNoMoreSeen.Load(), "Close() must send a final UpdateLog{NoMore:true} even after server-side cancellation")
+	assert.True(t, finalTaskStateSeen.Load(), "Close() must send a final UpdateTask with the populated final state even after server-side cancellation")
+}