feat: Enable jobs.<job_id>.timeout-minutes and jobs.<job_id>.continue-on-error (#1032)

Two `jobs.<job_id>` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both:

- **`jobs.<job_id>.timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`).
- **`jobs.<job_id>.continue-on-error`** — evaluates the expression when a job fails. If all failing matrix combinations had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it), and the tolerated failure reports `success` to dependent jobs through the `needs` context so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe.

Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`).

## Version compatibility

These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side.

- `timeout-minutes`: runner-only, no server dependency.
- `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored.

Related: [GitHub](https://github.com/go-gitea/gitea/pull/38100)

---------

Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/1032
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
2026-06-22 01:34:25 +02:00 · 2026-06-21 17:05:36 +00:00
parent 007717956a
commit 6bdcb54828
7 changed files with 355 additions and 24 deletions
--- a/act/exprparser/functions.go
+++ b/act/exprparser/functions.go
@@ -266,7 +266,7 @@ func (impl *interperterImpl) jobSuccess() (bool, error) { //nolint:unparam // pr
 	jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
 	for _, needs := range jobNeeds {
-		if jobs[needs].Result != "success" {
+		if jobs[needs].NeedsResult() != "success" {
 			return false, nil
 		}
 	}
@@ -283,7 +283,7 @@ func (impl *interperterImpl) jobFailure() (bool, error) { //nolint:unparam // pr
 	jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
 	for _, needs := range jobNeeds {
-		if jobs[needs].Result == "failure" {
+		if jobs[needs].NeedsResult() == "failure" {
 			return true, nil
 		}
 	}
--- a/act/model/workflow.go
+++ b/act/model/workflow.go
@@ -190,23 +190,52 @@ func (w *Workflow) WorkflowCallConfig() *WorkflowCall {
 // Job is the structure of one job in a workflow
 type Job struct {
-	Name           string                    `yaml:"name"`
+	Name               string                    `yaml:"name"`
-	RawNeeds       yaml.Node                 `yaml:"needs"`
+	RawNeeds           yaml.Node                 `yaml:"needs"`
-	RawRunsOn      yaml.Node                 `yaml:"runs-on"`
+	RawRunsOn          yaml.Node                 `yaml:"runs-on"`
-	Env            yaml.Node                 `yaml:"env"`
+	Env                yaml.Node                 `yaml:"env"`
-	If             yaml.Node                 `yaml:"if"`
+	If                 yaml.Node                 `yaml:"if"`
-	Steps          []*Step                   `yaml:"steps"`
+	Steps              []*Step                   `yaml:"steps"`
-	TimeoutMinutes string                    `yaml:"timeout-minutes"`
+	TimeoutMinutes     string                    `yaml:"timeout-minutes"`
-	Services       map[string]*ContainerSpec `yaml:"services"`
+	RawContinueOnError string                    `yaml:"continue-on-error"`
-	Strategy       *Strategy                 `yaml:"strategy"`
+	Services           map[string]*ContainerSpec `yaml:"services"`
-	RawContainer   yaml.Node                 `yaml:"container"`
+	Strategy           *Strategy                 `yaml:"strategy"`
-	Defaults       Defaults                  `yaml:"defaults"`
+	RawContainer       yaml.Node                 `yaml:"container"`
-	Outputs        map[string]string         `yaml:"outputs"`
+	Defaults           Defaults                  `yaml:"defaults"`
-	Uses           string                    `yaml:"uses"`
+	Outputs            map[string]string         `yaml:"outputs"`
-	With           map[string]any            `yaml:"with"`
+	Uses               string                    `yaml:"uses"`
-	RawSecrets     yaml.Node                 `yaml:"secrets"`
+	With               map[string]any            `yaml:"with"`
-	RawPermissions yaml.Node                 `yaml:"permissions"`
+	RawSecrets         yaml.Node                 `yaml:"secrets"`
-	Result         string
+	RawPermissions     yaml.Node                 `yaml:"permissions"`
 	Result             string
 	// Runtime fields set during execution (not from YAML):
 	ContinueOnError bool // true when all failing matrix combinations had continue-on-error=true
 	hasFirmFailure  bool // true once any combination failed without continue-on-error
 }
 // SetContinueOnError folds the outcome of one failed matrix combination into
 // the job-wide continue-on-error state.
 //
 // A combination that failed without continue-on-error (continueOnErr == false)
 // latches the job as firmly failed: ContinueOnError is cleared and can never be
 // set again. A tolerated failure (continueOnErr == true) only sets
 // ContinueOnError while no firm failure has been recorded.
 //
 // The method does no locking of its own; callers must hold the job lock so
 // that parallel matrix combinations are serialised.
 func (j *Job) SetContinueOnError(continueOnErr bool) {
 	if !continueOnErr {
 		// Firm failure: remember it and override any earlier tolerated failure.
 		j.hasFirmFailure = true
 		j.ContinueOnError = false
 		return
 	}
 	if j.hasFirmFailure {
 		// A firm failure was already seen; a later tolerated one cannot undo it.
 		return
 	}
 	j.ContinueOnError = true
 }
 // NeedsResult returns the job result as it is exposed to dependent jobs
 // through the `needs` context.
 //
 // A job whose Result is "failure" but whose ContinueOnError flag is set is
 // reported as "success", so dependents gated on the default `if: success()`
 // are not blocked by a tolerated failure. The job's own Result field is left
 // untouched (it stays "failure"). Every other result is returned verbatim.
 func (j *Job) NeedsResult() string {
 	tolerated := j.ContinueOnError && j.Result == "failure"
 	if !tolerated {
 		return j.Result
 	}
 	return "success"
 }
 // Strategy for the job
--- a/act/model/workflow_test.go
+++ b/act/model/workflow_test.go
@@ -32,6 +32,32 @@ func TestStepCloneIsolatesMutableFields(t *testing.T) {
 	assert.Equal(t, "original", orig.With["arg"], "With map must not be shared with the clone")
 }
 // TestJobNeedsResult pins what dependent jobs observe through the `needs`
 // context: a failure tolerated via continue-on-error is surfaced as "success",
 // while a firm failure and every non-failure result pass through unchanged.
 func TestJobNeedsResult(t *testing.T) {
 	type scenario struct {
 		name            string
 		result          string
 		continueOnError bool
 		want            string
 	}
 	scenarios := []scenario{
 		{name: "tolerated failure reports success", result: "failure", continueOnError: true, want: "success"},
 		{name: "firm failure reports failure", result: "failure", continueOnError: false, want: "failure"},
 		{name: "success is unchanged", result: "success", continueOnError: false, want: "success"},
 		{name: "success with continue-on-error is unchanged", result: "success", continueOnError: true, want: "success"},
 		{name: "empty result is unchanged", result: "", continueOnError: true, want: ""},
 		{name: "skipped is unchanged", result: "skipped", continueOnError: true, want: "skipped"},
 	}
 	for _, sc := range scenarios {
 		t.Run(sc.name, func(t *testing.T) {
 			job := &Job{
 				Result:          sc.result,
 				ContinueOnError: sc.continueOnError,
 			}
 			got := job.NeedsResult()
 			assert.Equal(t, sc.want, got)
 		})
 	}
 }
 func TestReadWorkflow_ScheduleEvent(t *testing.T) {
 	yaml := `
 name: local-action-docker-url
--- a/act/runner/expression.go
+++ b/act/runner/expression.go
@@ -56,7 +56,7 @@ func (rc *RunContext) NewExpressionEvaluatorWithEnv(ctx context.Context, env map
 		for _, needs := range jobNeeds {
 			using[needs] = exprparser.Needs{
 				Outputs: jobs[needs].Outputs,
-				Result:  jobs[needs].Result,
+				Result:  jobs[needs].NeedsResult(),
 			}
 		}
@@ -127,7 +127,7 @@ func (rc *RunContext) NewStepExpressionEvaluator(ctx context.Context, step step)
 	for _, needs := range jobNeeds {
 		using[needs] = exprparser.Needs{
 			Outputs: jobs[needs].Outputs,
-			Result:  jobs[needs].Result,
+			Result:  jobs[needs].NeedsResult(),
 		}
 	}
--- a/act/runner/job_executor.go
+++ b/act/runner/job_executor.go
@@ -22,6 +22,7 @@ import (
 	"gitea.com/gitea/runner/act/common"
 	"gitea.com/gitea/runner/act/container"
 	"gitea.com/gitea/runner/act/exprparser"
 	"gitea.com/gitea/runner/act/model"
 )
@@ -204,11 +205,21 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
 	return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...).
 		Finally(func(ctx context.Context) error {
 			var cancel context.CancelFunc
-			if ctx.Err() == context.Canceled {
+			switch ctx.Err() {
 			case context.Canceled:
 				// in case of an aborted run, we still should execute the
 				// post steps to allow cleanup.
 				ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute)
 				defer cancel()
 			case context.DeadlineExceeded:
 				// The job hit its timeout-minutes. Without a fresh context the post
 				// steps would run against the already-expired context and be skipped,
 				// so cleanup post-hooks (e.g. actions/checkout post, cache save) would
 				// not run. Derive the context with WithoutCancel so the new deadline
 				// applies but the job error state is preserved: the job is still
 				// reported as failed and container teardown matches a normal failure.
 				ctx, cancel = context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute)
 				defer cancel()
 			}
 			return postExecutor(ctx)
 		}).
@@ -223,6 +234,12 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
 	// read-modify-write of the job result so a failing combination is not lost-updated by a
 	// concurrent succeeding one.
 	job := rc.Run.Job()
 	var continueOnError bool
 	if !success {
 		// Use a fresh context so an expired job timeout cannot block expression evaluation.
 		evalCtx := common.WithLogger(context.Background(), common.Logger(ctx))
 		continueOnError = evaluateJobContinueOnError(evalCtx, rc, job)
 	}
 	jobResult := func() string {
 		defer lockJob(job)()
 		result := "success"
@@ -233,6 +250,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
 		}
 		if !success {
 			result = "failure"
 			job.SetContinueOnError(continueOnError)
 		}
 		info.result(result)
 		return result
@@ -271,6 +289,32 @@ func setJobOutputs(ctx context.Context, rc *RunContext) {
 	}
 }
 // applyJobTimeout applies the job-level timeout-minutes to ctx, mirroring the
 // step-level evaluateStepTimeout in step.go.
 //
 // job.TimeoutMinutes is interpolated with rc.ExprEval, trimmed, and parsed as
 // a base-10 integer number of minutes. The returned context carries the
 // resulting deadline and the returned CancelFunc must always be called by the
 // caller. When the value is empty or cannot be parsed, ctx is returned
 // unchanged together with a no-op CancelFunc; an unparsable value is logged as
 // a warning instead of being dropped silently.
 //
 // Values too large to be represented as a time.Duration are clamped to the
 // largest representable whole number of minutes, because the multiplication
 // would otherwise overflow int64 and yield an arbitrary (possibly already
 // expired) deadline. Zero and negative values are passed through unchanged and
 // therefore produce an already-expired context.
 func applyJobTimeout(ctx context.Context, rc *RunContext, job *model.Job) (context.Context, context.CancelFunc) {
 	// Largest minute count for which minutes*time.Minute still fits in int64.
 	const maxTimeoutMinutes = int64((1<<63 - 1) / time.Minute)

 	timeout := strings.TrimSpace(rc.ExprEval.Interpolate(ctx, job.TimeoutMinutes))
 	if timeout == "" {
 		return ctx, func() {}
 	}

 	timeoutMinutes, err := strconv.ParseInt(timeout, 10, 64)
 	if err != nil {
 		common.Logger(ctx).Warnf("ignoring invalid job timeout-minutes %q: %v", timeout, err)
 		return ctx, func() {}
 	}
 	if timeoutMinutes > maxTimeoutMinutes {
 		common.Logger(ctx).Warnf("job timeout-minutes %d is too large, clamping to %d", timeoutMinutes, maxTimeoutMinutes)
 		timeoutMinutes = maxTimeoutMinutes
 	}
 	return context.WithTimeout(ctx, time.Duration(timeoutMinutes)*time.Minute)
 }
 // evaluateJobContinueOnError reports whether the job-level continue-on-error
 // setting evaluates to true for the given run context.
 //
 // An absent or whitespace-only value yields false. If the expression cannot be
 // evaluated, a warning is logged and false is returned, so a broken expression
 // never causes a failure to be tolerated.
 func evaluateJobContinueOnError(ctx context.Context, rc *RunContext, job *model.Job) bool {
 	raw := strings.TrimSpace(job.RawContinueOnError)
 	if raw == "" {
 		return false
 	}

 	evaluator := rc.NewExpressionEvaluator(ctx)
 	tolerated, err := EvalBool(ctx, evaluator, raw, exprparser.DefaultStatusCheckNone)
 	if err == nil {
 		return tolerated
 	}

 	common.Logger(ctx).Warnf("continue-on-error expression %q evaluation failed: %v", raw, err)
 	return false
 }
 func tryUploadJobSummary(ctx context.Context, rc *RunContext) {
 	if rc == nil || rc.JobContainer == nil || rc.Config == nil {
 		return
--- a/act/runner/job_executor_test.go
+++ b/act/runner/job_executor_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 	yaml "go.yaml.in/yaml/v4"
 )
 func TestJobExecutor(t *testing.T) {
@@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) {
 	}
 }
 // TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup
 // path: when a job exceeds its timeout the job context is DeadlineExceeded, but
 // the post steps (cleanup hooks like actions/checkout post and cache save) must
 // still run against a fresh, non-expired context, and the job must still be
 // reported as failed.
 func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) {
 	ctx := common.WithJobErrorContainer(context.Background())
 	// The 200ms deadline stands in for an elapsed timeout-minutes. It is long
 	// enough that the main step (which blocks on ctx.Done below) is always
 	// reached before the deadline fires; otherwise the pipeline would
 	// short-circuit before the step runs and the job error would never be set.
 	ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
 	defer cancel()
 	jim := &jobInfoMock{}
 	sfm := &stepFactoryMock{}
 	// Minimal run context: a single job "test" with no configuration of its own.
 	rc := &RunContext{
 		JobContainer: &jobContainerMock{},
 		Run: &model.Run{
 			JobID: "test",
 			Workflow: &model.Workflow{
 				Jobs: map[string]*model.Job{
 					"test": {},
 				},
 			},
 		},
 		Config: &Config{},
 	}
 	rc.ExprEval = rc.NewExpressionEvaluator(ctx)
 	stepModel := &model.Step{ID: "1"}
 	jim.On("steps").Return([]*model.Step{stepModel})
 	jim.On("matrix").Return(map[string]any{})
 	jim.On("startContainer").Return(func(ctx context.Context) error { return nil })
 	jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil })
 	jim.On("closeContainer").Return(func(ctx context.Context) error { return nil })
 	// The job timed out, so it must be reported as failed. stopContainer is left
 	// unexpected on purpose: a timed-out (failed) job preserves its error state, so
 	// the graceful stop is skipped exactly like any other failure without AutoRemove.
 	jim.On("result", "failure")
 	sm := &stepMock{}
 	sfm.On("newStep", stepModel, rc).Return(sm, nil)
 	sm.On("pre").Return(func(ctx context.Context) error { return nil })
 	// The main step runs past the job timeout: it blocks until the job context is
 	// done, mirroring a step that overruns timeout-minutes.
 	sm.On("main").Return(func(ctx context.Context) error {
 		<-ctx.Done()
 		return ctx.Err()
 	})
 	// The post step records that it ran and the state of the context it was
 	// handed, so both can be asserted after the executor returns.
 	var postRan bool
 	var postCtxErr error
 	sm.On("post").Return(func(ctx context.Context) error {
 		postRan = true
 		postCtxErr = ctx.Err()
 		return nil
 	})
 	executor := newJobExecutor(jim, sfm, rc)
 	// The executor itself returns nil on timeout: the failure is surfaced through
 	// the job result ("failure", asserted via the result mock below), not the
 	// return value.
 	require.NoError(t, executor(ctx))
 	assert.True(t, postRan, "post step must run after a job timeout")
 	require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context")
 	// Verify every expectation registered above, including result("failure").
 	jim.AssertExpectations(t)
 	sfm.AssertExpectations(t)
 	sm.AssertExpectations(t)
 }
 // TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path
 // end-to-end: two combinations share one *model.Job and continue-on-error is
 // keyed on matrix.experimental, so one combination tolerates its failure and the
 // other does not. The job is reported as continue-on-error only when EVERY failing
 // combination was tolerated; a single firm failure makes the whole job firm, and
 // handleFailure then fails the run.
 func TestSetJobResultMatrixContinueOnError(t *testing.T) {
 	const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest"
 	// newSharedJob parses jobYAML into a fresh job and wraps it in a workflow
 	// under the ID "job1", so each subtest starts from clean runtime state.
 	newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) {
 		t.Helper()
 		var job *model.Job
 		require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
 		return job, &model.Workflow{
 			Name: "workflow1",
 			Jobs: map[string]*model.Job{"job1": job},
 		}
 	}
 	// planFor builds a single-stage, single-run plan for "job1", which is the
 	// input handleFailure inspects.
 	planFor := func(wf *model.Workflow) *model.Plan {
 		return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}}
 	}
 	ctx := context.Background()
 	// fail drives a single matrix combination through the failure path; each
 	// RunContext is its own jobInfo (rc implements jobInfo) and shares the job.
 	fail := func(wf *model.Workflow, experimental bool) {
 		rc := newTestRC(wf, map[string]any{"experimental": experimental})
 		setJobResult(ctx, rc, rc, false)
 	}
 	t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) {
 		job, wf := newSharedJob(t)
 		// Order is intentional: the tolerated combination finishes first, then the
 		// firm one. The firm-failure latch must still win regardless of order.
 		fail(wf, true)
 		fail(wf, false)
 		assert.Equal(t, "failure", job.Result)
 		assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm")
 		assert.Error(t, handleFailure(planFor(wf))(ctx))
 	})
 	t.Run("all tolerated failures do not fail the run", func(t *testing.T) {
 		job, wf := newSharedJob(t)
 		// Both combinations evaluate continue-on-error to true, so the job result
 		// is still "failure" but handleFailure must not return an error.
 		fail(wf, true)
 		fail(wf, true)
 		assert.Equal(t, "failure", job.Result)
 		assert.True(t, job.ContinueOnError, "every failing combination was tolerated")
 		assert.NoError(t, handleFailure(planFor(wf))(ctx))
 	})
 }
 func TestHasJobSummaryCapability(t *testing.T) {
 	assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts"))
 	assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts"))
@@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte {
 	require.NoError(t, tw.Close())
 	return buf.Bytes()
 }
 // newTestRC returns a minimal RunContext for job "job1" of wf with the given
 // matrix values. ExprEval is left unset.
 func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext {
 	cfg := &Config{
 		Workdir:   ".",
 		Platforms: map[string]string{"ubuntu-latest": "ubuntu-latest"},
 	}
 	run := &model.Run{JobID: "job1", Workflow: wf}

 	return &RunContext{
 		Config:      cfg,
 		Run:         run,
 		Matrix:      matrix,
 		Env:         map[string]string{},
 		StepResults: map[string]*model.StepResult{},
 	}
 }
 // makeTestRC parses jobYAML into a job, registers it as "job1" in a workflow
 // named "workflow1", and returns a RunContext (no matrix) for it with ExprEval
 // already initialised. A YAML parse error fails the test immediately.
 func makeTestRC(t *testing.T, jobYAML string) *RunContext {
 	t.Helper()

 	var job *model.Job
 	err := yaml.Unmarshal([]byte(jobYAML), &job)
 	require.NoError(t, err)

 	wf := &model.Workflow{
 		Name: "workflow1",
 		Jobs: map[string]*model.Job{"job1": job},
 	}
 	rc := newTestRC(wf, nil)
 	rc.ExprEval = rc.NewExpressionEvaluator(context.Background())
 	return rc
 }
 // TestApplyJobTimeout checks that applyJobTimeout turns a numeric
 // timeout-minutes into a deadline of that many minutes, and leaves the context
 // without a deadline when the field is absent or not a number.
 //
 // The deadline value itself is asserted (not merely its presence) so that a
 // unit mistake, such as treating the value as seconds, is caught.
 func TestApplyJobTimeout(t *testing.T) {
 	cases := []struct {
 		name        string
 		yaml        string
 		wantTimeout time.Duration // zero means no deadline is expected
 	}{
 		{"empty", "runs-on: ubuntu-latest", 0},
 		{"integer", "timeout-minutes: 5\nruns-on: ubuntu-latest", 5 * time.Minute},
 		{"non-numeric ignored", "timeout-minutes: abc\nruns-on: ubuntu-latest", 0},
 	}
 	for _, tc := range cases {
 		t.Run(tc.name, func(t *testing.T) {
 			rc := makeTestRC(t, tc.yaml)
 			ctx := context.Background()
 			start := time.Now()
 			newCtx, cancel := applyJobTimeout(ctx, rc, rc.Run.Job())
 			defer cancel()
 			deadline, hasDeadline := newCtx.Deadline()
 			if tc.wantTimeout == 0 {
 				assert.False(t, hasDeadline)
 				return
 			}
 			require.True(t, hasDeadline)
 			// Allow generous slack for scheduling; the point is minutes vs. other units.
 			assert.WithinDuration(t, start.Add(tc.wantTimeout), deadline, 10*time.Second)
 		})
 	}
 }
 // TestEvaluateJobContinueOnError covers the literal and expression forms of the
 // job-level continue-on-error field, plus the case where it is absent.
 func TestEvaluateJobContinueOnError(t *testing.T) {
 	type scenario struct {
 		name string
 		yaml string
 		want bool
 	}
 	scenarios := []scenario{
 		{name: "absent", yaml: "runs-on: ubuntu-latest", want: false},
 		{name: "true", yaml: "continue-on-error: true\nruns-on: ubuntu-latest", want: true},
 		{name: "false", yaml: "continue-on-error: false\nruns-on: ubuntu-latest", want: false},
 		{name: "expression true", yaml: "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", want: true},
 		{name: "expression false", yaml: "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", want: false},
 	}
 	for _, sc := range scenarios {
 		t.Run(sc.name, func(t *testing.T) {
 			rc := makeTestRC(t, sc.yaml)
 			job := rc.Run.Job()
 			got := evaluateJobContinueOnError(context.Background(), rc, job)
 			assert.Equal(t, sc.want, got)
 		})
 	}
 }
 // TestJobSetContinueOnError replays sequences of SetContinueOnError calls on a
 // fresh job and checks the resulting ContinueOnError flag: once a firm failure
 // (false) has been recorded, no call order may turn the flag back on.
 func TestJobSetContinueOnError(t *testing.T) {
 	scenarios := []struct {
 		name  string
 		calls []bool
 		want  bool
 	}{
 		{name: "first call true", calls: []bool{true}, want: true},
 		{name: "first call false", calls: []bool{false}, want: false},
 		{name: "true then false locks to false", calls: []bool{true, false}, want: false},
 		{name: "false then true stays false", calls: []bool{false, true}, want: false},
 		{name: "true then true stays true", calls: []bool{true, true}, want: true},
 	}
 	for _, sc := range scenarios {
 		t.Run(sc.name, func(t *testing.T) {
 			j := &model.Job{}
 			for _, continueOnErr := range sc.calls {
 				j.SetContinueOnError(continueOnErr)
 			}
 			if sc.want {
 				assert.True(t, j.ContinueOnError)
 			} else {
 				assert.False(t, j.ContinueOnError)
 			}
 		})
 	}
 }
--- a/act/runner/runner.go
+++ b/act/runner/runner.go
@@ -250,7 +250,10 @@ func (runner *runnerImpl) NewPlanExecutor(plan *model.Plan) common.Executor {
 							return err
 						}
-						return executor(common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix)))
+						jobCtx := common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix))
 						jobCtx, cancelTimeout := applyJobTimeout(jobCtx, rc, job)
 						defer cancelTimeout()
 						return executor(jobCtx)
 					})
 				}
 				// Run all matrix combinations of this job, then drop its aggregation mutex: the
@@ -305,7 +308,7 @@ func handleFailure(plan *model.Plan) common.Executor {
 	return func(ctx context.Context) error {
 		for _, stage := range plan.Stages {
 			for _, run := range stage.Runs {
-				if run.Job().Result == "failure" {
+				if run.Job().Result == "failure" && !run.Job().ContinueOnError {
 					return fmt.Errorf("Job '%s' failed", run.String())
 				}
 			}