feat: Enable jobs.<job_id>.timeout-minutes and jobs.<job_id>.continue-on-error (#1032)

Two `jobs.<job_id>` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both:

- **`jobs.<job_id>.timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`).

- **`jobs.<job_id>.continue-on-error`** — evaluates the expression when a job fails. If all failing matrix combinations had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it), and the tolerated failure reports `success` to dependent jobs through the `needs` context so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe.

Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`).

## Version compatibility

These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side.

- `timeout-minutes`: runner-only, no server dependency.
- `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored.

Related: [Github](https://github.com/go-gitea/gitea/pull/38100)
---------

Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/1032
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
Nicolas
2026-06-21 17:05:36 +00:00
parent 007717956a
commit 6bdcb54828
7 changed files with 355 additions and 24 deletions

View File

@@ -28,6 +28,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
yaml "go.yaml.in/yaml/v4"
)
func TestJobExecutor(t *testing.T) {
@@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) {
}
}
// TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup
// path: when a job exceeds its timeout the job context is DeadlineExceeded, but
// the post steps (cleanup hooks like actions/checkout post and cache save) must
// still run against a fresh, non-expired context, and the job must still be
// reported as failed.
func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) {
ctx := common.WithJobErrorContainer(context.Background())
// The timeout is generous so the main step (which blocks on ctx.Done below) is
// always reached before the deadline fires; otherwise the pipeline would
// short-circuit before the step runs and the job error would never be set.
ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
defer cancel()
jim := &jobInfoMock{}
sfm := &stepFactoryMock{}
rc := &RunContext{
JobContainer: &jobContainerMock{},
Run: &model.Run{
JobID: "test",
Workflow: &model.Workflow{
Jobs: map[string]*model.Job{
"test": {},
},
},
},
Config: &Config{},
}
rc.ExprEval = rc.NewExpressionEvaluator(ctx)
stepModel := &model.Step{ID: "1"}
jim.On("steps").Return([]*model.Step{stepModel})
jim.On("matrix").Return(map[string]any{})
jim.On("startContainer").Return(func(ctx context.Context) error { return nil })
jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil })
jim.On("closeContainer").Return(func(ctx context.Context) error { return nil })
// The job timed out, so it must be reported as failed. stopContainer is left
// unexpected on purpose: a timed-out (failed) job preserves its error state, so
// the graceful stop is skipped exactly like any other failure without AutoRemove.
jim.On("result", "failure")
sm := &stepMock{}
sfm.On("newStep", stepModel, rc).Return(sm, nil)
sm.On("pre").Return(func(ctx context.Context) error { return nil })
// The main step runs past the job timeout: it blocks until the job context is
// done, mirroring a step that overruns timeout-minutes.
sm.On("main").Return(func(ctx context.Context) error {
<-ctx.Done()
return ctx.Err()
})
var postRan bool
var postCtxErr error
sm.On("post").Return(func(ctx context.Context) error {
postRan = true
postCtxErr = ctx.Err()
return nil
})
executor := newJobExecutor(jim, sfm, rc)
// The executor itself returns nil on timeout: the failure is surfaced through
// the job result ("failure", asserted via the result mock below), not the
// return value.
require.NoError(t, executor(ctx))
assert.True(t, postRan, "post step must run after a job timeout")
require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context")
jim.AssertExpectations(t)
sfm.AssertExpectations(t)
sm.AssertExpectations(t)
}
// TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path
// end-to-end: two combinations share one *model.Job and continue-on-error is
// keyed on matrix.experimental, so one combination tolerates its failure and the
// other does not. The job is reported as continue-on-error only when EVERY failing
// combination was tolerated; a single firm failure makes the whole job firm, and
// handleFailure then fails the run.
func TestSetJobResultMatrixContinueOnError(t *testing.T) {
const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest"
newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) {
t.Helper()
var job *model.Job
require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
return job, &model.Workflow{
Name: "workflow1",
Jobs: map[string]*model.Job{"job1": job},
}
}
planFor := func(wf *model.Workflow) *model.Plan {
return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}}
}
ctx := context.Background()
// fail drives a single matrix combination through the failure path; each
// RunContext is its own jobInfo (rc implements jobInfo) and shares the job.
fail := func(wf *model.Workflow, experimental bool) {
rc := newTestRC(wf, map[string]any{"experimental": experimental})
setJobResult(ctx, rc, rc, false)
}
t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) {
job, wf := newSharedJob(t)
// Order is intentional: the tolerated combination finishes first, then the
// firm one. The firm-failure latch must still win regardless of order.
fail(wf, true)
fail(wf, false)
assert.Equal(t, "failure", job.Result)
assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm")
assert.Error(t, handleFailure(planFor(wf))(ctx))
})
t.Run("all tolerated failures do not fail the run", func(t *testing.T) {
job, wf := newSharedJob(t)
fail(wf, true)
fail(wf, true)
assert.Equal(t, "failure", job.Result)
assert.True(t, job.ContinueOnError, "every failing combination was tolerated")
assert.NoError(t, handleFailure(planFor(wf))(ctx))
})
}
func TestHasJobSummaryCapability(t *testing.T) {
assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts"))
assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts"))
@@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte {
require.NoError(t, tw.Close())
return buf.Bytes()
}
func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext {
return &RunContext{
Config: &Config{
Workdir: ".",
Platforms: map[string]string{
"ubuntu-latest": "ubuntu-latest",
},
},
StepResults: map[string]*model.StepResult{},
Env: map[string]string{},
Matrix: matrix,
Run: &model.Run{JobID: "job1", Workflow: wf},
}
}
func makeTestRC(t *testing.T, jobYAML string) *RunContext {
t.Helper()
var job *model.Job
require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
rc := newTestRC(&model.Workflow{
Name: "workflow1",
Jobs: map[string]*model.Job{"job1": job},
}, nil)
rc.ExprEval = rc.NewExpressionEvaluator(context.Background())
return rc
}
func TestApplyJobTimeout(t *testing.T) {
cases := []struct {
name string
yaml string
wantTimeout bool
}{
{"empty", "runs-on: ubuntu-latest", false},
{"integer", "timeout-minutes: 5\nruns-on: ubuntu-latest", true},
{"non-numeric ignored", "timeout-minutes: abc\nruns-on: ubuntu-latest", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
rc := makeTestRC(t, tc.yaml)
ctx := context.Background()
newCtx, cancel := applyJobTimeout(ctx, rc, rc.Run.Job())
defer cancel()
_, hasDeadline := newCtx.Deadline()
assert.Equal(t, tc.wantTimeout, hasDeadline)
})
}
}
func TestEvaluateJobContinueOnError(t *testing.T) {
cases := []struct {
name string
yaml string
want bool
}{
{"absent", "runs-on: ubuntu-latest", false},
{"true", "continue-on-error: true\nruns-on: ubuntu-latest", true},
{"false", "continue-on-error: false\nruns-on: ubuntu-latest", false},
{"expression true", "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", true},
{"expression false", "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
rc := makeTestRC(t, tc.yaml)
got := evaluateJobContinueOnError(context.Background(), rc, rc.Run.Job())
assert.Equal(t, tc.want, got)
})
}
}
func TestJobSetContinueOnError(t *testing.T) {
t.Run("first call true", func(t *testing.T) {
j := &model.Job{}
j.SetContinueOnError(true)
assert.True(t, j.ContinueOnError)
})
t.Run("first call false", func(t *testing.T) {
j := &model.Job{}
j.SetContinueOnError(false)
assert.False(t, j.ContinueOnError)
})
t.Run("true then false locks to false", func(t *testing.T) {
j := &model.Job{}
j.SetContinueOnError(true)
j.SetContinueOnError(false)
assert.False(t, j.ContinueOnError)
})
t.Run("false then true stays false", func(t *testing.T) {
j := &model.Job{}
j.SetContinueOnError(false)
j.SetContinueOnError(true)
assert.False(t, j.ContinueOnError)
})
t.Run("true then true stays true", func(t *testing.T) {
j := &model.Job{}
j.SetContinueOnError(true)
j.SetContinueOnError(true)
assert.True(t, j.ContinueOnError)
})
}