feat: Enable jobs.<job_id>.timeout-minutes and jobs.<job_id>.continue-on-error (#1032)

Two `jobs.<job_id>` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both: - **`jobs.<job_id>.timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`). - **`jobs.<job_id>.continue-on-error`** — evaluates the expression when a job fails. If all failing matrix combinations had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it), and the tolerated failure reports `success` to dependent jobs through the `needs` context so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe. Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`). ## Version compatibility These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side. - `timeout-minutes`: runner-only, no server dependency. - `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored. Related: [Github](https://github.com/go-gitea/gitea/pull/38100) --------- Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com> Co-authored-by: silverwind <me@silverwind.io> Reviewed-on: https://gitea.com/gitea/runner/pulls/1032 Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com> Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
2026-06-22 09:44:24 +02:00 · 2026-06-21 17:05:36 +00:00
parent 007717956a
commit 6bdcb54828
7 changed files with 355 additions and 24 deletions
--- a/act/runner/job_executor_test.go
+++ b/act/runner/job_executor_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	yaml "go.yaml.in/yaml/v4"
 )

 func TestJobExecutor(t *testing.T) {
@@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) {
 	}
 }

+// TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup
+// path: when a job exceeds its timeout the job context is DeadlineExceeded, but
+// the post steps (cleanup hooks like actions/checkout post and cache save) must
+// still run against a fresh, non-expired context, and the job must still be
+// reported as failed.
+func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) {
+	ctx := common.WithJobErrorContainer(context.Background())
+	// The timeout is generous so the main step (which blocks on ctx.Done below) is
+	// always reached before the deadline fires; otherwise the pipeline would
+	// short-circuit before the step runs and the job error would never be set.
+	ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+	defer cancel()
+
+	jim := &jobInfoMock{}
+	sfm := &stepFactoryMock{}
+	rc := &RunContext{
+		JobContainer: &jobContainerMock{},
+		Run: &model.Run{
+			JobID: "test",
+			Workflow: &model.Workflow{
+				Jobs: map[string]*model.Job{
+					"test": {},
+				},
+			},
+		},
+		Config: &Config{},
+	}
+	rc.ExprEval = rc.NewExpressionEvaluator(ctx)
+
+	stepModel := &model.Step{ID: "1"}
+	jim.On("steps").Return([]*model.Step{stepModel})
+	jim.On("matrix").Return(map[string]any{})
+	jim.On("startContainer").Return(func(ctx context.Context) error { return nil })
+	jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil })
+	jim.On("closeContainer").Return(func(ctx context.Context) error { return nil })
+	// The job timed out, so it must be reported as failed. stopContainer is left
+	// unexpected on purpose: a timed-out (failed) job preserves its error state, so
+	// the graceful stop is skipped exactly like any other failure without AutoRemove.
+	jim.On("result", "failure")
+
+	sm := &stepMock{}
+	sfm.On("newStep", stepModel, rc).Return(sm, nil)
+	sm.On("pre").Return(func(ctx context.Context) error { return nil })
+	// The main step runs past the job timeout: it blocks until the job context is
+	// done, mirroring a step that overruns timeout-minutes.
+	sm.On("main").Return(func(ctx context.Context) error {
+		<-ctx.Done()
+		return ctx.Err()
+	})
+
+	var postRan bool
+	var postCtxErr error
+	sm.On("post").Return(func(ctx context.Context) error {
+		postRan = true
+		postCtxErr = ctx.Err()
+		return nil
+	})
+
+	executor := newJobExecutor(jim, sfm, rc)
+	// The executor itself returns nil on timeout: the failure is surfaced through
+	// the job result ("failure", asserted via the result mock below), not the
+	// return value.
+	require.NoError(t, executor(ctx))
+
+	assert.True(t, postRan, "post step must run after a job timeout")
+	require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context")
+
+	jim.AssertExpectations(t)
+	sfm.AssertExpectations(t)
+	sm.AssertExpectations(t)
+}
+
+// TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path
+// end-to-end: two combinations share one *model.Job and continue-on-error is
+// keyed on matrix.experimental, so one combination tolerates its failure and the
+// other does not. The job is reported as continue-on-error only when EVERY failing
+// combination was tolerated; a single firm failure makes the whole job firm, and
+// handleFailure then fails the run.
+func TestSetJobResultMatrixContinueOnError(t *testing.T) {
+	const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest"
+
+	newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) {
+		t.Helper()
+		var job *model.Job
+		require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
+		return job, &model.Workflow{
+			Name: "workflow1",
+			Jobs: map[string]*model.Job{"job1": job},
+		}
+	}
+
+	planFor := func(wf *model.Workflow) *model.Plan {
+		return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}}
+	}
+
+	ctx := context.Background()
+
+	// fail drives a single matrix combination through the failure path; each
+	// RunContext is its own jobInfo (rc implements jobInfo) and shares the job.
+	fail := func(wf *model.Workflow, experimental bool) {
+		rc := newTestRC(wf, map[string]any{"experimental": experimental})
+		setJobResult(ctx, rc, rc, false)
+	}
+
+	t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) {
+		job, wf := newSharedJob(t)
+		// Order is intentional: the tolerated combination finishes first, then the
+		// firm one. The firm-failure latch must still win regardless of order.
+		fail(wf, true)
+		fail(wf, false)
+
+		assert.Equal(t, "failure", job.Result)
+		assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm")
+		assert.Error(t, handleFailure(planFor(wf))(ctx))
+	})
+
+	t.Run("all tolerated failures do not fail the run", func(t *testing.T) {
+		job, wf := newSharedJob(t)
+		fail(wf, true)
+		fail(wf, true)
+
+		assert.Equal(t, "failure", job.Result)
+		assert.True(t, job.ContinueOnError, "every failing combination was tolerated")
+		assert.NoError(t, handleFailure(planFor(wf))(ctx))
+	})
+}
+
 func TestHasJobSummaryCapability(t *testing.T) {
 	assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts"))
 	assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts"))
@@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte {
 	require.NoError(t, tw.Close())
 	return buf.Bytes()
 }
+
+func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext {
+	return &RunContext{
+		Config: &Config{
+			Workdir: ".",
+			Platforms: map[string]string{
+				"ubuntu-latest": "ubuntu-latest",
+			},
+		},
+		StepResults: map[string]*model.StepResult{},
+		Env:         map[string]string{},
+		Matrix:      matrix,
+		Run:         &model.Run{JobID: "job1", Workflow: wf},
+	}
+}
+
+func makeTestRC(t *testing.T, jobYAML string) *RunContext {
+	t.Helper()
+	var job *model.Job
+	require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
+	rc := newTestRC(&model.Workflow{
+		Name: "workflow1",
+		Jobs: map[string]*model.Job{"job1": job},
+	}, nil)
+	rc.ExprEval = rc.NewExpressionEvaluator(context.Background())
+	return rc
+}
+
+func TestApplyJobTimeout(t *testing.T) {
+	cases := []struct {
+		name        string
+		yaml        string
+		wantTimeout bool
+	}{
+		{"empty", "runs-on: ubuntu-latest", false},
+		{"integer", "timeout-minutes: 5\nruns-on: ubuntu-latest", true},
+		{"non-numeric ignored", "timeout-minutes: abc\nruns-on: ubuntu-latest", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			rc := makeTestRC(t, tc.yaml)
+			ctx := context.Background()
+			newCtx, cancel := applyJobTimeout(ctx, rc, rc.Run.Job())
+			defer cancel()
+			_, hasDeadline := newCtx.Deadline()
+			assert.Equal(t, tc.wantTimeout, hasDeadline)
+		})
+	}
+}
+
+func TestEvaluateJobContinueOnError(t *testing.T) {
+	cases := []struct {
+		name string
+		yaml string
+		want bool
+	}{
+		{"absent", "runs-on: ubuntu-latest", false},
+		{"true", "continue-on-error: true\nruns-on: ubuntu-latest", true},
+		{"false", "continue-on-error: false\nruns-on: ubuntu-latest", false},
+		{"expression true", "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", true},
+		{"expression false", "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			rc := makeTestRC(t, tc.yaml)
+			got := evaluateJobContinueOnError(context.Background(), rc, rc.Run.Job())
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}
+
+func TestJobSetContinueOnError(t *testing.T) {
+	t.Run("first call true", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		assert.True(t, j.ContinueOnError)
+	})
+	t.Run("first call false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(false)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("true then false locks to false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		j.SetContinueOnError(false)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("false then true stays false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(false)
+		j.SetContinueOnError(true)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("true then true stays true", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		j.SetContinueOnError(true)
+		assert.True(t, j.ContinueOnError)
+	})
+}