feat: Enable jobs.<job_id>.timeout-minutes and jobs.<job_id>.continue-on-error (#1032)

Two `jobs.<job_id>` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both:

- **`jobs.<job_id>.timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`).

- **`jobs.<job_id>.continue-on-error`** — evaluates the expression when a job fails. If every failing matrix combination had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it). The tolerated failure is also reported as `success` to dependent jobs through the `needs` context, so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe.

Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`).

## Version compatibility

These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side.

- `timeout-minutes`: runner-only, no server dependency.
- `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored.

Related Gitea server PR: [go-gitea/gitea#38100](https://github.com/go-gitea/gitea/pull/38100)
---------

Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/1032
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
This commit is contained in:
Nicolas
2026-06-21 17:05:36 +00:00
parent 007717956a
commit 6bdcb54828
7 changed files with 355 additions and 24 deletions

View File

@@ -266,7 +266,7 @@ func (impl *interperterImpl) jobSuccess() (bool, error) { //nolint:unparam // pr
jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job()) jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
for _, needs := range jobNeeds { for _, needs := range jobNeeds {
if jobs[needs].Result != "success" { if jobs[needs].NeedsResult() != "success" {
return false, nil return false, nil
} }
} }
@@ -283,7 +283,7 @@ func (impl *interperterImpl) jobFailure() (bool, error) { //nolint:unparam // pr
jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job()) jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
for _, needs := range jobNeeds { for _, needs := range jobNeeds {
if jobs[needs].Result == "failure" { if jobs[needs].NeedsResult() == "failure" {
return true, nil return true, nil
} }
} }

View File

@@ -190,23 +190,52 @@ func (w *Workflow) WorkflowCallConfig() *WorkflowCall {
// Job is the structure of one job in a workflow // Job is the structure of one job in a workflow
type Job struct { type Job struct {
Name string `yaml:"name"` Name string `yaml:"name"`
RawNeeds yaml.Node `yaml:"needs"` RawNeeds yaml.Node `yaml:"needs"`
RawRunsOn yaml.Node `yaml:"runs-on"` RawRunsOn yaml.Node `yaml:"runs-on"`
Env yaml.Node `yaml:"env"` Env yaml.Node `yaml:"env"`
If yaml.Node `yaml:"if"` If yaml.Node `yaml:"if"`
Steps []*Step `yaml:"steps"` Steps []*Step `yaml:"steps"`
TimeoutMinutes string `yaml:"timeout-minutes"` TimeoutMinutes string `yaml:"timeout-minutes"`
Services map[string]*ContainerSpec `yaml:"services"` RawContinueOnError string `yaml:"continue-on-error"`
Strategy *Strategy `yaml:"strategy"` Services map[string]*ContainerSpec `yaml:"services"`
RawContainer yaml.Node `yaml:"container"` Strategy *Strategy `yaml:"strategy"`
Defaults Defaults `yaml:"defaults"` RawContainer yaml.Node `yaml:"container"`
Outputs map[string]string `yaml:"outputs"` Defaults Defaults `yaml:"defaults"`
Uses string `yaml:"uses"` Outputs map[string]string `yaml:"outputs"`
With map[string]any `yaml:"with"` Uses string `yaml:"uses"`
RawSecrets yaml.Node `yaml:"secrets"` With map[string]any `yaml:"with"`
RawPermissions yaml.Node `yaml:"permissions"` RawSecrets yaml.Node `yaml:"secrets"`
Result string RawPermissions yaml.Node `yaml:"permissions"`
Result string
// Runtime fields set during execution (not from YAML):
ContinueOnError bool // true when all failing matrix combinations had continue-on-error=true
hasFirmFailure bool // true once any combination failed without continue-on-error
}
// SetContinueOnError folds the continue-on-error verdict of one failed matrix
// combination into the job-wide ContinueOnError flag.
//
// A firm failure (continueOnErr == false) latches: it clears ContinueOnError
// and any later tolerated failure can no longer set it again. The method does
// no locking of its own, so callers must hold the job lock when matrix
// combinations run in parallel.
func (j *Job) SetContinueOnError(continueOnErr bool) {
	if !continueOnErr {
		// Firm failure: remember it and make the whole job firm.
		j.hasFirmFailure = true
		j.ContinueOnError = false
		return
	}
	if j.hasFirmFailure {
		// An earlier firm failure already decided the outcome.
		return
	}
	j.ContinueOnError = true
}
// NeedsResult returns the job result as seen by dependent jobs through the
// `needs` context. A job that failed but was tolerated via continue-on-error
// reports "success" to its dependents, matching GitHub: such a failure must not
// block jobs gated on the default `if: success()`. Only the reported value is
// rewritten; the job's own Result field is left untouched and stays "failure".
// Every other result is returned verbatim.
func (j *Job) NeedsResult() string {
if j.Result == "failure" && j.ContinueOnError {
return "success"
}
return j.Result
} }
// Strategy for the job // Strategy for the job

View File

@@ -32,6 +32,32 @@ func TestStepCloneIsolatesMutableFields(t *testing.T) {
assert.Equal(t, "original", orig.With["arg"], "With map must not be shared with the clone") assert.Equal(t, "original", orig.With["arg"], "With map must not be shared with the clone")
} }
// TestJobNeedsResult pins what dependent jobs observe through the `needs`
// context: a failure tolerated by continue-on-error is reported as "success",
// so dependents gated on the default `if: success()` are not blocked, while a
// firm failure and every non-failure result pass through unchanged.
func TestJobNeedsResult(t *testing.T) {
	type scenario struct {
		name            string
		result          string
		continueOnError bool
		want            string
	}
	scenarios := []scenario{
		{name: "tolerated failure reports success", result: "failure", continueOnError: true, want: "success"},
		{name: "firm failure reports failure", result: "failure", continueOnError: false, want: "failure"},
		{name: "success is unchanged", result: "success", continueOnError: false, want: "success"},
		{name: "success with continue-on-error is unchanged", result: "success", continueOnError: true, want: "success"},
		{name: "empty result is unchanged", result: "", continueOnError: true, want: ""},
		{name: "skipped is unchanged", result: "skipped", continueOnError: true, want: "skipped"},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			job := &Job{Result: sc.result, ContinueOnError: sc.continueOnError}
			got := job.NeedsResult()
			assert.Equal(t, sc.want, got)
		})
	}
}
func TestReadWorkflow_ScheduleEvent(t *testing.T) { func TestReadWorkflow_ScheduleEvent(t *testing.T) {
yaml := ` yaml := `
name: local-action-docker-url name: local-action-docker-url

View File

@@ -56,7 +56,7 @@ func (rc *RunContext) NewExpressionEvaluatorWithEnv(ctx context.Context, env map
for _, needs := range jobNeeds { for _, needs := range jobNeeds {
using[needs] = exprparser.Needs{ using[needs] = exprparser.Needs{
Outputs: jobs[needs].Outputs, Outputs: jobs[needs].Outputs,
Result: jobs[needs].Result, Result: jobs[needs].NeedsResult(),
} }
} }
@@ -127,7 +127,7 @@ func (rc *RunContext) NewStepExpressionEvaluator(ctx context.Context, step step)
for _, needs := range jobNeeds { for _, needs := range jobNeeds {
using[needs] = exprparser.Needs{ using[needs] = exprparser.Needs{
Outputs: jobs[needs].Outputs, Outputs: jobs[needs].Outputs,
Result: jobs[needs].Result, Result: jobs[needs].NeedsResult(),
} }
} }

View File

@@ -22,6 +22,7 @@ import (
"gitea.com/gitea/runner/act/common" "gitea.com/gitea/runner/act/common"
"gitea.com/gitea/runner/act/container" "gitea.com/gitea/runner/act/container"
"gitea.com/gitea/runner/act/exprparser"
"gitea.com/gitea/runner/act/model" "gitea.com/gitea/runner/act/model"
) )
@@ -204,11 +205,21 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...). return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...).
Finally(func(ctx context.Context) error { Finally(func(ctx context.Context) error {
var cancel context.CancelFunc var cancel context.CancelFunc
if ctx.Err() == context.Canceled { switch ctx.Err() {
case context.Canceled:
// in case of an aborted run, we still should execute the // in case of an aborted run, we still should execute the
// post steps to allow cleanup. // post steps to allow cleanup.
ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute) ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute)
defer cancel() defer cancel()
case context.DeadlineExceeded:
// The job hit its timeout-minutes. Without a fresh context the post
// steps would run against the already-expired context and be skipped,
// so cleanup post-hooks (e.g. actions/checkout post, cache save) would
// not run. Derive the context with WithoutCancel so the new deadline
// applies but the job error state is preserved: the job is still
// reported as failed and container teardown matches a normal failure.
ctx, cancel = context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute)
defer cancel()
} }
return postExecutor(ctx) return postExecutor(ctx)
}). }).
@@ -223,6 +234,12 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
// read-modify-write of the job result so a failing combination is not lost-updated by a // read-modify-write of the job result so a failing combination is not lost-updated by a
// concurrent succeeding one. // concurrent succeeding one.
job := rc.Run.Job() job := rc.Run.Job()
var continueOnError bool
if !success {
// Use a fresh context so an expired job timeout cannot block expression evaluation.
evalCtx := common.WithLogger(context.Background(), common.Logger(ctx))
continueOnError = evaluateJobContinueOnError(evalCtx, rc, job)
}
jobResult := func() string { jobResult := func() string {
defer lockJob(job)() defer lockJob(job)()
result := "success" result := "success"
@@ -233,6 +250,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
} }
if !success { if !success {
result = "failure" result = "failure"
job.SetContinueOnError(continueOnError)
} }
info.result(result) info.result(result)
return result return result
@@ -271,6 +289,32 @@ func setJobOutputs(ctx context.Context, rc *RunContext) {
} }
} }
// applyJobTimeout applies the job-level timeout-minutes to ctx. It is the
// job-level counterpart of the step-level evaluateStepTimeout in step.go.
//
// job.TimeoutMinutes is interpolated through rc.ExprEval, so it may be an
// expression. When the interpolated value is empty, is not a base-10 integer,
// or is not positive, ctx is returned unchanged together with a no-op cancel
// function. Otherwise the returned context carries a deadline that many
// minutes from now, and the caller must invoke the returned cancel function.
func applyJobTimeout(ctx context.Context, rc *RunContext, job *model.Job) (context.Context, context.CancelFunc) {
	noTimeout := func() {}

	// Trim so an expression that expands with surrounding whitespace still parses.
	timeout := strings.TrimSpace(rc.ExprEval.Interpolate(ctx, job.TimeoutMinutes))
	if timeout == "" {
		return ctx, noTimeout
	}

	timeoutMinutes, err := strconv.ParseInt(timeout, 10, 64)
	if err != nil {
		// Non-numeric values are ignored, as if the field were unset.
		return ctx, noTimeout
	}
	if timeoutMinutes <= 0 {
		// A zero or negative duration would yield an already-expired context,
		// cutting the job pipeline short before any step runs. Treat it like
		// an unset timeout instead.
		return ctx, noTimeout
	}
	return context.WithTimeout(ctx, time.Duration(timeoutMinutes)*time.Minute)
}
// evaluateJobContinueOnError evaluates the job-level continue-on-error
// expression and reports whether a failure of this job should be tolerated.
//
// An absent or blank expression yields false. If evaluation fails, a warning
// is logged through the logger attached to ctx and false is returned.
func evaluateJobContinueOnError(ctx context.Context, rc *RunContext, job *model.Job) bool {
	rawExpr := strings.TrimSpace(job.RawContinueOnError)
	if len(rawExpr) == 0 {
		return false
	}

	evaluator := rc.NewExpressionEvaluator(ctx)
	tolerated, evalErr := EvalBool(ctx, evaluator, rawExpr, exprparser.DefaultStatusCheckNone)
	if evalErr == nil {
		return tolerated
	}

	common.Logger(ctx).Warnf("continue-on-error expression %q evaluation failed: %v", rawExpr, evalErr)
	return false
}
func tryUploadJobSummary(ctx context.Context, rc *RunContext) { func tryUploadJobSummary(ctx context.Context, rc *RunContext) {
if rc == nil || rc.JobContainer == nil || rc.Config == nil { if rc == nil || rc.JobContainer == nil || rc.Config == nil {
return return

View File

@@ -28,6 +28,7 @@ import (
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
yaml "go.yaml.in/yaml/v4"
) )
func TestJobExecutor(t *testing.T) { func TestJobExecutor(t *testing.T) {
@@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) {
} }
} }
// TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup
// path: when a job exceeds its timeout the job context is DeadlineExceeded, but
// the post steps (cleanup hooks like actions/checkout post and cache save) must
// still run against a fresh, non-expired context, and the job must still be
// reported as failed.
func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) {
ctx := common.WithJobErrorContainer(context.Background())
// The timeout is generous so the main step (which blocks on ctx.Done below) is
// always reached before the deadline fires; otherwise the pipeline would
// short-circuit before the step runs and the job error would never be set.
ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
defer cancel()
jim := &jobInfoMock{}
sfm := &stepFactoryMock{}
rc := &RunContext{
JobContainer: &jobContainerMock{},
Run: &model.Run{
JobID: "test",
Workflow: &model.Workflow{
Jobs: map[string]*model.Job{
"test": {},
},
},
},
Config: &Config{},
}
rc.ExprEval = rc.NewExpressionEvaluator(ctx)
stepModel := &model.Step{ID: "1"}
// Wire the job with exactly one step, an empty matrix and no-op container
// lifecycle hooks.
jim.On("steps").Return([]*model.Step{stepModel})
jim.On("matrix").Return(map[string]any{})
jim.On("startContainer").Return(func(ctx context.Context) error { return nil })
jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil })
jim.On("closeContainer").Return(func(ctx context.Context) error { return nil })
// The job timed out, so it must be reported as failed. stopContainer is left
// unexpected on purpose: a timed-out (failed) job preserves its error state, so
// the graceful stop is skipped exactly like any other failure without AutoRemove.
jim.On("result", "failure")
sm := &stepMock{}
sfm.On("newStep", stepModel, rc).Return(sm, nil)
sm.On("pre").Return(func(ctx context.Context) error { return nil })
// The main step runs past the job timeout: it blocks until the job context is
// done, mirroring a step that overruns timeout-minutes.
sm.On("main").Return(func(ctx context.Context) error {
<-ctx.Done()
return ctx.Err()
})
// Record whether the post hook ran and the error state of the context it was
// handed, so both can be asserted after the executor returns.
var postRan bool
var postCtxErr error
sm.On("post").Return(func(ctx context.Context) error {
postRan = true
postCtxErr = ctx.Err()
return nil
})
executor := newJobExecutor(jim, sfm, rc)
// The executor itself returns nil on timeout: the failure is surfaced through
// the job result ("failure", asserted via the result mock below), not the
// return value.
require.NoError(t, executor(ctx))
assert.True(t, postRan, "post step must run after a job timeout")
require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context")
jim.AssertExpectations(t)
sfm.AssertExpectations(t)
sm.AssertExpectations(t)
}
// TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path
// end-to-end: two combinations share one *model.Job and continue-on-error is
// keyed on matrix.experimental, so one combination tolerates its failure and the
// other does not. The job is reported as continue-on-error only when EVERY failing
// combination was tolerated; a single firm failure makes the whole job firm, and
// handleFailure then fails the run.
func TestSetJobResultMatrixContinueOnError(t *testing.T) {
const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest"
// newSharedJob parses jobYAML into a fresh job and registers it as "job1" in a
// one-job workflow, so each subtest starts from untouched job state.
newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) {
t.Helper()
var job *model.Job
require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
return job, &model.Workflow{
Name: "workflow1",
Jobs: map[string]*model.Job{"job1": job},
}
}
// planFor wraps the workflow in a single-stage, single-run plan for "job1",
// which is the input handleFailure is called with below.
planFor := func(wf *model.Workflow) *model.Plan {
return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}}
}
ctx := context.Background()
// fail drives a single matrix combination through the failure path; each
// RunContext is its own jobInfo (rc implements jobInfo) and shares the job.
fail := func(wf *model.Workflow, experimental bool) {
rc := newTestRC(wf, map[string]any{"experimental": experimental})
setJobResult(ctx, rc, rc, false)
}
t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) {
job, wf := newSharedJob(t)
// Order is intentional: the tolerated combination finishes first, then the
// firm one. The firm-failure latch must still win regardless of order.
fail(wf, true)
fail(wf, false)
assert.Equal(t, "failure", job.Result)
assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm")
assert.Error(t, handleFailure(planFor(wf))(ctx))
})
t.Run("all tolerated failures do not fail the run", func(t *testing.T) {
job, wf := newSharedJob(t)
fail(wf, true)
fail(wf, true)
assert.Equal(t, "failure", job.Result)
assert.True(t, job.ContinueOnError, "every failing combination was tolerated")
assert.NoError(t, handleFailure(planFor(wf))(ctx))
})
}
func TestHasJobSummaryCapability(t *testing.T) { func TestHasJobSummaryCapability(t *testing.T) {
assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts")) assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts"))
assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts")) assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts"))
@@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte {
require.NoError(t, tw.Close()) require.NoError(t, tw.Close())
return buf.Bytes() return buf.Bytes()
} }
// newTestRC builds a minimal RunContext for job "job1" of wf with the given
// matrix values. The config uses "." as the workdir and maps the
// "ubuntu-latest" platform to itself; step results and env start out empty.
func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext {
	cfg := &Config{
		Workdir:   ".",
		Platforms: map[string]string{"ubuntu-latest": "ubuntu-latest"},
	}
	run := &model.Run{JobID: "job1", Workflow: wf}

	return &RunContext{
		Config:      cfg,
		StepResults: map[string]*model.StepResult{},
		Env:         map[string]string{},
		Matrix:      matrix,
		Run:         run,
	}
}
// makeTestRC parses jobYAML into a job, registers it as "job1" in a one-job
// workflow and returns a RunContext for it (no matrix) whose expression
// evaluator is already initialised. YAML parse errors fail the test.
func makeTestRC(t *testing.T, jobYAML string) *RunContext {
	t.Helper()

	var parsed *model.Job
	err := yaml.Unmarshal([]byte(jobYAML), &parsed)
	require.NoError(t, err)

	workflow := &model.Workflow{
		Name: "workflow1",
		Jobs: map[string]*model.Job{"job1": parsed},
	}
	rc := newTestRC(workflow, nil)
	rc.ExprEval = rc.NewExpressionEvaluator(context.Background())
	return rc
}
// TestApplyJobTimeout checks when applyJobTimeout attaches a deadline to the
// context: an integer timeout-minutes does, while a missing or non-numeric
// value leaves the context without one.
func TestApplyJobTimeout(t *testing.T) {
	type scenario struct {
		name        string
		yaml        string
		wantTimeout bool
	}
	scenarios := []scenario{
		{name: "empty", yaml: "runs-on: ubuntu-latest", wantTimeout: false},
		{name: "integer", yaml: "timeout-minutes: 5\nruns-on: ubuntu-latest", wantTimeout: true},
		{name: "non-numeric ignored", yaml: "timeout-minutes: abc\nruns-on: ubuntu-latest", wantTimeout: false},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			rc := makeTestRC(t, sc.yaml)
			timeoutCtx, cancel := applyJobTimeout(context.Background(), rc, rc.Run.Job())
			defer cancel()

			_, hasDeadline := timeoutCtx.Deadline()
			assert.Equal(t, sc.wantTimeout, hasDeadline)
		})
	}
}
// TestEvaluateJobContinueOnError checks evaluateJobContinueOnError for an
// absent field, for literal booleans and for `${{ }}` expressions that
// evaluate to true or false.
func TestEvaluateJobContinueOnError(t *testing.T) {
	type scenario struct {
		name string
		yaml string
		want bool
	}
	scenarios := []scenario{
		{name: "absent", yaml: "runs-on: ubuntu-latest", want: false},
		{name: "true", yaml: "continue-on-error: true\nruns-on: ubuntu-latest", want: true},
		{name: "false", yaml: "continue-on-error: false\nruns-on: ubuntu-latest", want: false},
		{name: "expression true", yaml: "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", want: true},
		{name: "expression false", yaml: "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", want: false},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			rc := makeTestRC(t, sc.yaml)
			tolerated := evaluateJobContinueOnError(context.Background(), rc, rc.Run.Job())
			assert.Equal(t, sc.want, tolerated)
		})
	}
}
// TestJobSetContinueOnError checks how successive SetContinueOnError calls on
// one job combine: the flag ends up true only when every call was true, and a
// single false call keeps it false regardless of call order.
func TestJobSetContinueOnError(t *testing.T) {
	scenarios := []struct {
		name  string
		calls []bool
		want  bool
	}{
		{name: "first call true", calls: []bool{true}, want: true},
		{name: "first call false", calls: []bool{false}, want: false},
		{name: "true then false locks to false", calls: []bool{true, false}, want: false},
		{name: "false then true stays false", calls: []bool{false, true}, want: false},
		{name: "true then true stays true", calls: []bool{true, true}, want: true},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			job := &model.Job{}
			for _, verdict := range sc.calls {
				job.SetContinueOnError(verdict)
			}
			assert.Equal(t, sc.want, job.ContinueOnError)
		})
	}
}

View File

@@ -250,7 +250,10 @@ func (runner *runnerImpl) NewPlanExecutor(plan *model.Plan) common.Executor {
return err return err
} }
return executor(common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix))) jobCtx := common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix))
jobCtx, cancelTimeout := applyJobTimeout(jobCtx, rc, job)
defer cancelTimeout()
return executor(jobCtx)
}) })
} }
// Run all matrix combinations of this job, then drop its aggregation mutex: the // Run all matrix combinations of this job, then drop its aggregation mutex: the
@@ -305,7 +308,7 @@ func handleFailure(plan *model.Plan) common.Executor {
return func(ctx context.Context) error { return func(ctx context.Context) error {
for _, stage := range plan.Stages { for _, stage := range plan.Stages {
for _, run := range stage.Runs { for _, run := range stage.Runs {
if run.Job().Result == "failure" { if run.Job().Result == "failure" && !run.Job().ContinueOnError {
return fmt.Errorf("Job '%s' failed", run.String()) return fmt.Errorf("Job '%s' failed", run.String())
} }
} }