From 6bdcb548282d464eed228183b9e3e6bbb1ab6e27 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 21 Jun 2026 17:05:36 +0000 Subject: [PATCH] feat: Enable `jobs..timeout-minutes` and `jobs..continue-on-error` (#1032) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two `jobs.` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both: - **`jobs..timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`). - **`jobs..continue-on-error`** — evaluates the expression when a job fails. If all failing matrix combinations had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it), and the tolerated failure reports `success` to dependent jobs through the `needs` context so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe. Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`). ## Version compatibility These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side. - `timeout-minutes`: runner-only, no server dependency. - `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored. Related: [Github](https://github.com/go-gitea/gitea/pull/38100) --------- Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com> Co-authored-by: silverwind Reviewed-on: https://gitea.com/gitea/runner/pulls/1032 Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com> Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com> --- act/exprparser/functions.go | 4 +- act/model/workflow.go | 63 ++++++--- act/model/workflow_test.go | 26 ++++ act/runner/expression.go | 4 +- act/runner/job_executor.go | 46 ++++++- act/runner/job_executor_test.go | 229 ++++++++++++++++++++++++++++++++ act/runner/runner.go | 7 +- 7 files changed, 355 insertions(+), 24 deletions(-) diff --git a/act/exprparser/functions.go b/act/exprparser/functions.go index 4243abe2..294227c4 100644 --- a/act/exprparser/functions.go +++ b/act/exprparser/functions.go @@ -266,7 +266,7 @@ func (impl *interperterImpl) jobSuccess() (bool, error) { //nolint:unparam // pr jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job()) for _, needs := range jobNeeds { - if jobs[needs].Result != "success" { + if jobs[needs].NeedsResult() != "success" { return false, nil } } @@ -283,7 +283,7 @@ func (impl *interperterImpl) jobFailure() (bool, error) { //nolint:unparam // pr jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job()) for _, needs := range jobNeeds { - if jobs[needs].Result == "failure" { + if jobs[needs].NeedsResult() == "failure" { return true, nil } } diff --git a/act/model/workflow.go b/act/model/workflow.go index 8327c800..01ba049f 100644 --- a/act/model/workflow.go +++ b/act/model/workflow.go @@ -190,23 +190,52 @@ func (w *Workflow) WorkflowCallConfig() *WorkflowCall { // Job is the structure of one job in a workflow type Job struct { - Name string `yaml:"name"` - RawNeeds yaml.Node `yaml:"needs"` - RawRunsOn yaml.Node `yaml:"runs-on"` - Env yaml.Node `yaml:"env"` - If yaml.Node `yaml:"if"` - Steps []*Step `yaml:"steps"` - TimeoutMinutes string `yaml:"timeout-minutes"` - Services map[string]*ContainerSpec `yaml:"services"` - Strategy *Strategy `yaml:"strategy"` - RawContainer yaml.Node `yaml:"container"` - Defaults Defaults `yaml:"defaults"` - Outputs map[string]string `yaml:"outputs"` - Uses string `yaml:"uses"` - With map[string]any `yaml:"with"` - RawSecrets yaml.Node `yaml:"secrets"` - RawPermissions yaml.Node `yaml:"permissions"` - Result string + Name string `yaml:"name"` + RawNeeds yaml.Node `yaml:"needs"` + RawRunsOn yaml.Node `yaml:"runs-on"` + Env yaml.Node `yaml:"env"` + If yaml.Node `yaml:"if"` + Steps []*Step `yaml:"steps"` + TimeoutMinutes string `yaml:"timeout-minutes"` + RawContinueOnError string `yaml:"continue-on-error"` + Services map[string]*ContainerSpec `yaml:"services"` + Strategy *Strategy `yaml:"strategy"` + RawContainer yaml.Node `yaml:"container"` + Defaults Defaults `yaml:"defaults"` + Outputs map[string]string `yaml:"outputs"` + Uses string `yaml:"uses"` + With map[string]any `yaml:"with"` + RawSecrets yaml.Node `yaml:"secrets"` + RawPermissions yaml.Node `yaml:"permissions"` + Result string + // Runtime fields set during execution (not from YAML): + ContinueOnError bool // true when all failing matrix combinations had continue-on-error=true + hasFirmFailure bool // true once any combination failed without continue-on-error +} + +// SetContinueOnError records whether this combination's failure should not fail the workflow. +// Must be called under the job lock. Safe across parallel matrix combinations. +func (j *Job) SetContinueOnError(continueOnErr bool) { + if continueOnErr { + if !j.hasFirmFailure { + j.ContinueOnError = true + } + } else { + j.hasFirmFailure = true + j.ContinueOnError = false + } +} + +// NeedsResult returns the job result as seen by dependent jobs through the +// `needs` context. A job that failed but was tolerated via continue-on-error +// reports "success" to its dependents, matching GitHub: such a failure must not +// block jobs gated on the default `if: success()`, even though the overall +// workflow run is still marked as failed. +func (j *Job) NeedsResult() string { + if j.Result == "failure" && j.ContinueOnError { + return "success" + } + return j.Result } // Strategy for the job diff --git a/act/model/workflow_test.go b/act/model/workflow_test.go index f08f8b3b..3e66d197 100644 --- a/act/model/workflow_test.go +++ b/act/model/workflow_test.go @@ -32,6 +32,32 @@ func TestStepCloneIsolatesMutableFields(t *testing.T) { assert.Equal(t, "original", orig.With["arg"], "With map must not be shared with the clone") } +// TestJobNeedsResult guards the continue-on-error semantics exposed to dependent +// jobs through the `needs` context: a failed-but-tolerated job reports "success" +// so it does not block dependents gated on the default `if: success()`, matching +// GitHub. A firm failure and any non-failure result are reported verbatim. +func TestJobNeedsResult(t *testing.T) { + cases := []struct { + name string + result string + continueOnError bool + want string + }{ + {"tolerated failure reports success", "failure", true, "success"}, + {"firm failure reports failure", "failure", false, "failure"}, + {"success is unchanged", "success", false, "success"}, + {"success with continue-on-error is unchanged", "success", true, "success"}, + {"empty result is unchanged", "", true, ""}, + {"skipped is unchanged", "skipped", true, "skipped"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + j := &Job{Result: tc.result, ContinueOnError: tc.continueOnError} + assert.Equal(t, tc.want, j.NeedsResult()) + }) + } +} + func TestReadWorkflow_ScheduleEvent(t *testing.T) { yaml := ` name: local-action-docker-url diff --git a/act/runner/expression.go b/act/runner/expression.go index ba935daf..c64f921b 100644 --- a/act/runner/expression.go +++ b/act/runner/expression.go @@ -56,7 +56,7 @@ func (rc *RunContext) NewExpressionEvaluatorWithEnv(ctx context.Context, env map for _, needs := range jobNeeds { using[needs] = exprparser.Needs{ Outputs: jobs[needs].Outputs, - Result: jobs[needs].Result, + Result: jobs[needs].NeedsResult(), } } @@ -127,7 +127,7 @@ func (rc *RunContext) NewStepExpressionEvaluator(ctx context.Context, step step) for _, needs := range jobNeeds { using[needs] = exprparser.Needs{ Outputs: jobs[needs].Outputs, - Result: jobs[needs].Result, + Result: jobs[needs].NeedsResult(), } } diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go index b8f7ccfd..752f74a7 100644 --- a/act/runner/job_executor.go +++ b/act/runner/job_executor.go @@ -22,6 +22,7 @@ import ( "gitea.com/gitea/runner/act/common" "gitea.com/gitea/runner/act/container" + "gitea.com/gitea/runner/act/exprparser" "gitea.com/gitea/runner/act/model" ) @@ -204,11 +205,21 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...). Finally(func(ctx context.Context) error { var cancel context.CancelFunc - if ctx.Err() == context.Canceled { + switch ctx.Err() { + case context.Canceled: // in case of an aborted run, we still should execute the // post steps to allow cleanup. ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute) defer cancel() + case context.DeadlineExceeded: + // The job hit its timeout-minutes. Without a fresh context the post + // steps would run against the already-expired context and be skipped, + // so cleanup post-hooks (e.g. actions/checkout post, cache save) would + // not run. Derive the context with WithoutCancel so the new deadline + // applies but the job error state is preserved: the job is still + // reported as failed and container teardown matches a normal failure. + ctx, cancel = context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute) + defer cancel() } return postExecutor(ctx) }). @@ -223,6 +234,12 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo // read-modify-write of the job result so a failing combination is not lost-updated by a // concurrent succeeding one. job := rc.Run.Job() + var continueOnError bool + if !success { + // Use a fresh context so an expired job timeout cannot block expression evaluation. + evalCtx := common.WithLogger(context.Background(), common.Logger(ctx)) + continueOnError = evaluateJobContinueOnError(evalCtx, rc, job) + } jobResult := func() string { defer lockJob(job)() result := "success" @@ -233,6 +250,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo } if !success { result = "failure" + job.SetContinueOnError(continueOnError) } info.result(result) return result @@ -271,6 +289,32 @@ func setJobOutputs(ctx context.Context, rc *RunContext) { } } +// applyJobTimeout applies the job-level timeout-minutes to ctx, mirroring the +// step-level evaluateStepTimeout in step.go. +func applyJobTimeout(ctx context.Context, rc *RunContext, job *model.Job) (context.Context, context.CancelFunc) { + timeout := rc.ExprEval.Interpolate(ctx, job.TimeoutMinutes) + if timeout != "" { + if timeoutMinutes, err := strconv.ParseInt(timeout, 10, 64); err == nil { + return context.WithTimeout(ctx, time.Duration(timeoutMinutes)*time.Minute) + } + } + return ctx, func() {} +} + +// evaluateJobContinueOnError evaluates the job-level continue-on-error expression. +func evaluateJobContinueOnError(ctx context.Context, rc *RunContext, job *model.Job) bool { + expr := strings.TrimSpace(job.RawContinueOnError) + if expr == "" { + return false + } + continueOnError, err := EvalBool(ctx, rc.NewExpressionEvaluator(ctx), expr, exprparser.DefaultStatusCheckNone) + if err != nil { + common.Logger(ctx).Warnf("continue-on-error expression %q evaluation failed: %v", expr, err) + return false + } + return continueOnError +} + func tryUploadJobSummary(ctx context.Context, rc *RunContext) { if rc == nil || rc.JobContainer == nil || rc.Config == nil { return diff --git a/act/runner/job_executor_test.go b/act/runner/job_executor_test.go index 55228e85..4c1cf7f0 100644 --- a/act/runner/job_executor_test.go +++ b/act/runner/job_executor_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + yaml "go.yaml.in/yaml/v4" ) func TestJobExecutor(t *testing.T) { @@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) { } } +// TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup +// path: when a job exceeds its timeout the job context is DeadlineExceeded, but +// the post steps (cleanup hooks like actions/checkout post and cache save) must +// still run against a fresh, non-expired context, and the job must still be +// reported as failed. +func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) { + ctx := common.WithJobErrorContainer(context.Background()) + // The timeout is generous so the main step (which blocks on ctx.Done below) is + // always reached before the deadline fires; otherwise the pipeline would + // short-circuit before the step runs and the job error would never be set. + ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond) + defer cancel() + + jim := &jobInfoMock{} + sfm := &stepFactoryMock{} + rc := &RunContext{ + JobContainer: &jobContainerMock{}, + Run: &model.Run{ + JobID: "test", + Workflow: &model.Workflow{ + Jobs: map[string]*model.Job{ + "test": {}, + }, + }, + }, + Config: &Config{}, + } + rc.ExprEval = rc.NewExpressionEvaluator(ctx) + + stepModel := &model.Step{ID: "1"} + jim.On("steps").Return([]*model.Step{stepModel}) + jim.On("matrix").Return(map[string]any{}) + jim.On("startContainer").Return(func(ctx context.Context) error { return nil }) + jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil }) + jim.On("closeContainer").Return(func(ctx context.Context) error { return nil }) + // The job timed out, so it must be reported as failed. stopContainer is left + // unexpected on purpose: a timed-out (failed) job preserves its error state, so + // the graceful stop is skipped exactly like any other failure without AutoRemove. + jim.On("result", "failure") + + sm := &stepMock{} + sfm.On("newStep", stepModel, rc).Return(sm, nil) + sm.On("pre").Return(func(ctx context.Context) error { return nil }) + // The main step runs past the job timeout: it blocks until the job context is + // done, mirroring a step that overruns timeout-minutes. + sm.On("main").Return(func(ctx context.Context) error { + <-ctx.Done() + return ctx.Err() + }) + + var postRan bool + var postCtxErr error + sm.On("post").Return(func(ctx context.Context) error { + postRan = true + postCtxErr = ctx.Err() + return nil + }) + + executor := newJobExecutor(jim, sfm, rc) + // The executor itself returns nil on timeout: the failure is surfaced through + // the job result ("failure", asserted via the result mock below), not the + // return value. + require.NoError(t, executor(ctx)) + + assert.True(t, postRan, "post step must run after a job timeout") + require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context") + + jim.AssertExpectations(t) + sfm.AssertExpectations(t) + sm.AssertExpectations(t) +} + +// TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path +// end-to-end: two combinations share one *model.Job and continue-on-error is +// keyed on matrix.experimental, so one combination tolerates its failure and the +// other does not. The job is reported as continue-on-error only when EVERY failing +// combination was tolerated; a single firm failure makes the whole job firm, and +// handleFailure then fails the run. +func TestSetJobResultMatrixContinueOnError(t *testing.T) { + const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest" + + newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) { + t.Helper() + var job *model.Job + require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job)) + return job, &model.Workflow{ + Name: "workflow1", + Jobs: map[string]*model.Job{"job1": job}, + } + } + + planFor := func(wf *model.Workflow) *model.Plan { + return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}} + } + + ctx := context.Background() + + // fail drives a single matrix combination through the failure path; each + // RunContext is its own jobInfo (rc implements jobInfo) and shares the job. + fail := func(wf *model.Workflow, experimental bool) { + rc := newTestRC(wf, map[string]any{"experimental": experimental}) + setJobResult(ctx, rc, rc, false) + } + + t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) { + job, wf := newSharedJob(t) + // Order is intentional: the tolerated combination finishes first, then the + // firm one. The firm-failure latch must still win regardless of order. + fail(wf, true) + fail(wf, false) + + assert.Equal(t, "failure", job.Result) + assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm") + assert.Error(t, handleFailure(planFor(wf))(ctx)) + }) + + t.Run("all tolerated failures do not fail the run", func(t *testing.T) { + job, wf := newSharedJob(t) + fail(wf, true) + fail(wf, true) + + assert.Equal(t, "failure", job.Result) + assert.True(t, job.ContinueOnError, "every failing combination was tolerated") + assert.NoError(t, handleFailure(planFor(wf))(ctx)) + }) +} + func TestHasJobSummaryCapability(t *testing.T) { assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts")) assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts")) @@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte { require.NoError(t, tw.Close()) return buf.Bytes() } + +func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext { + return &RunContext{ + Config: &Config{ + Workdir: ".", + Platforms: map[string]string{ + "ubuntu-latest": "ubuntu-latest", + }, + }, + StepResults: map[string]*model.StepResult{}, + Env: map[string]string{}, + Matrix: matrix, + Run: &model.Run{JobID: "job1", Workflow: wf}, + } +} + +func makeTestRC(t *testing.T, jobYAML string) *RunContext { + t.Helper() + var job *model.Job + require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job)) + rc := newTestRC(&model.Workflow{ + Name: "workflow1", + Jobs: map[string]*model.Job{"job1": job}, + }, nil) + rc.ExprEval = rc.NewExpressionEvaluator(context.Background()) + return rc +} + +func TestApplyJobTimeout(t *testing.T) { + cases := []struct { + name string + yaml string + wantTimeout bool + }{ + {"empty", "runs-on: ubuntu-latest", false}, + {"integer", "timeout-minutes: 5\nruns-on: ubuntu-latest", true}, + {"non-numeric ignored", "timeout-minutes: abc\nruns-on: ubuntu-latest", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rc := makeTestRC(t, tc.yaml) + ctx := context.Background() + newCtx, cancel := applyJobTimeout(ctx, rc, rc.Run.Job()) + defer cancel() + _, hasDeadline := newCtx.Deadline() + assert.Equal(t, tc.wantTimeout, hasDeadline) + }) + } +} + +func TestEvaluateJobContinueOnError(t *testing.T) { + cases := []struct { + name string + yaml string + want bool + }{ + {"absent", "runs-on: ubuntu-latest", false}, + {"true", "continue-on-error: true\nruns-on: ubuntu-latest", true}, + {"false", "continue-on-error: false\nruns-on: ubuntu-latest", false}, + {"expression true", "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", true}, + {"expression false", "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + rc := makeTestRC(t, tc.yaml) + got := evaluateJobContinueOnError(context.Background(), rc, rc.Run.Job()) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestJobSetContinueOnError(t *testing.T) { + t.Run("first call true", func(t *testing.T) { + j := &model.Job{} + j.SetContinueOnError(true) + assert.True(t, j.ContinueOnError) + }) + t.Run("first call false", func(t *testing.T) { + j := &model.Job{} + j.SetContinueOnError(false) + assert.False(t, j.ContinueOnError) + }) + t.Run("true then false locks to false", func(t *testing.T) { + j := &model.Job{} + j.SetContinueOnError(true) + j.SetContinueOnError(false) + assert.False(t, j.ContinueOnError) + }) + t.Run("false then true stays false", func(t *testing.T) { + j := &model.Job{} + j.SetContinueOnError(false) + j.SetContinueOnError(true) + assert.False(t, j.ContinueOnError) + }) + t.Run("true then true stays true", func(t *testing.T) { + j := &model.Job{} + j.SetContinueOnError(true) + j.SetContinueOnError(true) + assert.True(t, j.ContinueOnError) + }) +} diff --git a/act/runner/runner.go b/act/runner/runner.go index 4deb683e..ff57c178 100644 --- a/act/runner/runner.go +++ b/act/runner/runner.go @@ -250,7 +250,10 @@ func (runner *runnerImpl) NewPlanExecutor(plan *model.Plan) common.Executor { return err } - return executor(common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix))) + jobCtx := common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix)) + jobCtx, cancelTimeout := applyJobTimeout(jobCtx, rc, job) + defer cancelTimeout() + return executor(jobCtx) }) } // Run all matrix combinations of this job, then drop its aggregation mutex: the @@ -305,7 +308,7 @@ func handleFailure(plan *model.Plan) common.Executor { return func(ctx context.Context) error { for _, stage := range plan.Stages { for _, run := range stage.Runs { - if run.Job().Result == "failure" { + if run.Job().Result == "failure" && !run.Job().ContinueOnError { return fmt.Errorf("Job '%s' failed", run.String()) } }