From 6bdcb548282d464eed228183b9e3e6bbb1ab6e27 Mon Sep 17 00:00:00 2001
From: Nicolas <bircni@icloud.com>
Date: Sun, 21 Jun 2026 17:05:36 +0000
Subject: [PATCH] feat: Enable `jobs.<job_id>.timeout-minutes` and
 `jobs.<job_id>.continue-on-error` (#1032)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two `jobs.<job_id>` workflow syntax fields were parsed from YAML but silently ignored. This PR implements both:

- **`jobs.<job_id>.timeout-minutes`** — applies a context deadline around the entire job execution (container start, pre-steps, main steps, post-steps). Mirrors the existing step-level `evaluateStepTimeout`. Supports expression interpolation (e.g. `${{ env.MY_TIMEOUT }}`).

- **`jobs.<job_id>.continue-on-error`** — evaluates the expression when a job fails. If all failing matrix combinations had `continue-on-error: true`, the job does not cause the workflow run to fail (`handleFailure` skips it), and the tolerated failure reports `success` to dependent jobs through the `needs` context so jobs gated on the default `if: success()` still run (matching GitHub). The "any firm failure wins" rule is serialised under the existing per-job lock, so parallel matrix combinations are safe.

Both features follow the same patterns already used at the step level (`evaluateStepTimeout` / `isContinueOnError` in `act/runner/step.go`).

## Version compatibility

These changes are backward compatible. With mismatched versions the feature degrades silently to the previous behaviour (field ignored) — no errors on either side.

- `timeout-minutes`: runner-only, no server dependency.
- `continue-on-error`: requires both this runner PR and the matching Gitea server PR to take full effect. With only one side updated, the field continues to be ignored.

Related: [Github](https://github.com/go-gitea/gitea/pull/38100)
---------

Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/1032
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
---
 act/exprparser/functions.go     |   4 +-
 act/model/workflow.go           |  63 ++++++---
 act/model/workflow_test.go      |  26 ++++
 act/runner/expression.go        |   4 +-
 act/runner/job_executor.go      |  46 ++++++-
 act/runner/job_executor_test.go | 229 ++++++++++++++++++++++++++++++++
 act/runner/runner.go            |   7 +-
 7 files changed, 355 insertions(+), 24 deletions(-)

diff --git a/act/exprparser/functions.go b/act/exprparser/functions.go
index 4243abe2..294227c4 100644
--- a/act/exprparser/functions.go
+++ b/act/exprparser/functions.go
@@ -266,7 +266,7 @@ func (impl *interperterImpl) jobSuccess() (bool, error) { //nolint:unparam // pr
 	jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
 
 	for _, needs := range jobNeeds {
-		if jobs[needs].Result != "success" {
+		if jobs[needs].NeedsResult() != "success" {
 			return false, nil
 		}
 	}
@@ -283,7 +283,7 @@ func (impl *interperterImpl) jobFailure() (bool, error) { //nolint:unparam // pr
 	jobNeeds := impl.getNeedsTransitive(impl.config.Run.Job())
 
 	for _, needs := range jobNeeds {
-		if jobs[needs].Result == "failure" {
+		if jobs[needs].NeedsResult() == "failure" {
 			return true, nil
 		}
 	}
diff --git a/act/model/workflow.go b/act/model/workflow.go
index 8327c800..01ba049f 100644
--- a/act/model/workflow.go
+++ b/act/model/workflow.go
@@ -190,23 +190,52 @@ func (w *Workflow) WorkflowCallConfig() *WorkflowCall {
 
 // Job is the structure of one job in a workflow
 type Job struct {
-	Name           string                    `yaml:"name"`
-	RawNeeds       yaml.Node                 `yaml:"needs"`
-	RawRunsOn      yaml.Node                 `yaml:"runs-on"`
-	Env            yaml.Node                 `yaml:"env"`
-	If             yaml.Node                 `yaml:"if"`
-	Steps          []*Step                   `yaml:"steps"`
-	TimeoutMinutes string                    `yaml:"timeout-minutes"`
-	Services       map[string]*ContainerSpec `yaml:"services"`
-	Strategy       *Strategy                 `yaml:"strategy"`
-	RawContainer   yaml.Node                 `yaml:"container"`
-	Defaults       Defaults                  `yaml:"defaults"`
-	Outputs        map[string]string         `yaml:"outputs"`
-	Uses           string                    `yaml:"uses"`
-	With           map[string]any            `yaml:"with"`
-	RawSecrets     yaml.Node                 `yaml:"secrets"`
-	RawPermissions yaml.Node                 `yaml:"permissions"`
-	Result         string
+	Name               string                    `yaml:"name"`
+	RawNeeds           yaml.Node                 `yaml:"needs"`
+	RawRunsOn          yaml.Node                 `yaml:"runs-on"`
+	Env                yaml.Node                 `yaml:"env"`
+	If                 yaml.Node                 `yaml:"if"`
+	Steps              []*Step                   `yaml:"steps"`
+	TimeoutMinutes     string                    `yaml:"timeout-minutes"`
+	RawContinueOnError string                    `yaml:"continue-on-error"`
+	Services           map[string]*ContainerSpec `yaml:"services"`
+	Strategy           *Strategy                 `yaml:"strategy"`
+	RawContainer       yaml.Node                 `yaml:"container"`
+	Defaults           Defaults                  `yaml:"defaults"`
+	Outputs            map[string]string         `yaml:"outputs"`
+	Uses               string                    `yaml:"uses"`
+	With               map[string]any            `yaml:"with"`
+	RawSecrets         yaml.Node                 `yaml:"secrets"`
+	RawPermissions     yaml.Node                 `yaml:"permissions"`
+	Result             string
+	// Runtime fields set during execution (not from YAML):
+	ContinueOnError bool // true when all failing matrix combinations had continue-on-error=true
+	hasFirmFailure  bool // true once any combination failed without continue-on-error
+}
+
+// SetContinueOnError records whether this combination's failure should not fail the workflow.
+// Must be called under the job lock. Safe across parallel matrix combinations.
+func (j *Job) SetContinueOnError(continueOnErr bool) {
+	if continueOnErr {
+		if !j.hasFirmFailure {
+			j.ContinueOnError = true
+		}
+	} else {
+		j.hasFirmFailure = true
+		j.ContinueOnError = false
+	}
+}
+
+// NeedsResult returns the job result as seen by dependent jobs through the
+// `needs` context. A job that failed but was tolerated via continue-on-error
+// reports "success" to its dependents, matching GitHub: such a failure must not
+// block jobs gated on the default `if: success()`, even though the overall
+// workflow run is still marked as failed.
+func (j *Job) NeedsResult() string {
+	if j.Result == "failure" && j.ContinueOnError {
+		return "success"
+	}
+	return j.Result
 }
 
 // Strategy for the job
diff --git a/act/model/workflow_test.go b/act/model/workflow_test.go
index f08f8b3b..3e66d197 100644
--- a/act/model/workflow_test.go
+++ b/act/model/workflow_test.go
@@ -32,6 +32,32 @@ func TestStepCloneIsolatesMutableFields(t *testing.T) {
 	assert.Equal(t, "original", orig.With["arg"], "With map must not be shared with the clone")
 }
 
+// TestJobNeedsResult guards the continue-on-error semantics exposed to dependent
+// jobs through the `needs` context: a failed-but-tolerated job reports "success"
+// so it does not block dependents gated on the default `if: success()`, matching
+// GitHub. A firm failure and any non-failure result are reported verbatim.
+func TestJobNeedsResult(t *testing.T) {
+	cases := []struct {
+		name            string
+		result          string
+		continueOnError bool
+		want            string
+	}{
+		{"tolerated failure reports success", "failure", true, "success"},
+		{"firm failure reports failure", "failure", false, "failure"},
+		{"success is unchanged", "success", false, "success"},
+		{"success with continue-on-error is unchanged", "success", true, "success"},
+		{"empty result is unchanged", "", true, ""},
+		{"skipped is unchanged", "skipped", true, "skipped"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			j := &Job{Result: tc.result, ContinueOnError: tc.continueOnError}
+			assert.Equal(t, tc.want, j.NeedsResult())
+		})
+	}
+}
+
 func TestReadWorkflow_ScheduleEvent(t *testing.T) {
 	yaml := `
 name: local-action-docker-url
diff --git a/act/runner/expression.go b/act/runner/expression.go
index ba935daf..c64f921b 100644
--- a/act/runner/expression.go
+++ b/act/runner/expression.go
@@ -56,7 +56,7 @@ func (rc *RunContext) NewExpressionEvaluatorWithEnv(ctx context.Context, env map
 		for _, needs := range jobNeeds {
 			using[needs] = exprparser.Needs{
 				Outputs: jobs[needs].Outputs,
-				Result:  jobs[needs].Result,
+				Result:  jobs[needs].NeedsResult(),
 			}
 		}
 
@@ -127,7 +127,7 @@ func (rc *RunContext) NewStepExpressionEvaluator(ctx context.Context, step step)
 	for _, needs := range jobNeeds {
 		using[needs] = exprparser.Needs{
 			Outputs: jobs[needs].Outputs,
-			Result:  jobs[needs].Result,
+			Result:  jobs[needs].NeedsResult(),
 		}
 	}
 
diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go
index b8f7ccfd..752f74a7 100644
--- a/act/runner/job_executor.go
+++ b/act/runner/job_executor.go
@@ -22,6 +22,7 @@ import (
 
 	"gitea.com/gitea/runner/act/common"
 	"gitea.com/gitea/runner/act/container"
+	"gitea.com/gitea/runner/act/exprparser"
 	"gitea.com/gitea/runner/act/model"
 )
 
@@ -204,11 +205,21 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo
 	return common.NewPipelineExecutor(info.startContainer(), common.NewPipelineExecutor(pipeline...).
 		Finally(func(ctx context.Context) error {
 			var cancel context.CancelFunc
-			if ctx.Err() == context.Canceled {
+			switch ctx.Err() {
+			case context.Canceled:
 				// in case of an aborted run, we still should execute the
 				// post steps to allow cleanup.
 				ctx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), 5*time.Minute)
 				defer cancel()
+			case context.DeadlineExceeded:
+				// The job hit its timeout-minutes. Without a fresh context the post
+				// steps would run against the already-expired context and be skipped,
+				// so cleanup post-hooks (e.g. actions/checkout post, cache save) would
+				// not run. Derive the context with WithoutCancel so the new deadline
+				// applies but the job error state is preserved: the job is still
+				// reported as failed and container teardown matches a normal failure.
+				ctx, cancel = context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute)
+				defer cancel()
 			}
 			return postExecutor(ctx)
 		}).
@@ -223,6 +234,12 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
 	// read-modify-write of the job result so a failing combination is not lost-updated by a
 	// concurrent succeeding one.
 	job := rc.Run.Job()
+	var continueOnError bool
+	if !success {
+		// Use a fresh context so an expired job timeout cannot block expression evaluation.
+		evalCtx := common.WithLogger(context.Background(), common.Logger(ctx))
+		continueOnError = evaluateJobContinueOnError(evalCtx, rc, job)
+	}
 	jobResult := func() string {
 		defer lockJob(job)()
 		result := "success"
@@ -233,6 +250,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo
 		}
 		if !success {
 			result = "failure"
+			job.SetContinueOnError(continueOnError)
 		}
 		info.result(result)
 		return result
@@ -271,6 +289,32 @@ func setJobOutputs(ctx context.Context, rc *RunContext) {
 	}
 }
 
+// applyJobTimeout applies the job-level timeout-minutes to ctx, mirroring the
+// step-level evaluateStepTimeout in step.go.
+func applyJobTimeout(ctx context.Context, rc *RunContext, job *model.Job) (context.Context, context.CancelFunc) {
+	timeout := rc.ExprEval.Interpolate(ctx, job.TimeoutMinutes)
+	if timeout != "" {
+		if timeoutMinutes, err := strconv.ParseInt(timeout, 10, 64); err == nil {
+			return context.WithTimeout(ctx, time.Duration(timeoutMinutes)*time.Minute)
+		}
+	}
+	return ctx, func() {}
+}
+
+// evaluateJobContinueOnError evaluates the job-level continue-on-error expression.
+func evaluateJobContinueOnError(ctx context.Context, rc *RunContext, job *model.Job) bool {
+	expr := strings.TrimSpace(job.RawContinueOnError)
+	if expr == "" {
+		return false
+	}
+	continueOnError, err := EvalBool(ctx, rc.NewExpressionEvaluator(ctx), expr, exprparser.DefaultStatusCheckNone)
+	if err != nil {
+		common.Logger(ctx).Warnf("continue-on-error expression %q evaluation failed: %v", expr, err)
+		return false
+	}
+	return continueOnError
+}
+
 func tryUploadJobSummary(ctx context.Context, rc *RunContext) {
 	if rc == nil || rc.JobContainer == nil || rc.Config == nil {
 		return
diff --git a/act/runner/job_executor_test.go b/act/runner/job_executor_test.go
index 55228e85..4c1cf7f0 100644
--- a/act/runner/job_executor_test.go
+++ b/act/runner/job_executor_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	yaml "go.yaml.in/yaml/v4"
 )
 
 func TestJobExecutor(t *testing.T) {
@@ -347,6 +348,133 @@ func TestNewJobExecutor(t *testing.T) {
 	}
 }
 
+// TestNewJobExecutorRunsPostStepsAfterTimeout guards the timeout-minutes cleanup
+// path: when a job exceeds its timeout the job context is DeadlineExceeded, but
+// the post steps (cleanup hooks like actions/checkout post and cache save) must
+// still run against a fresh, non-expired context, and the job must still be
+// reported as failed.
+func TestNewJobExecutorRunsPostStepsAfterTimeout(t *testing.T) {
+	ctx := common.WithJobErrorContainer(context.Background())
+	// The timeout is generous so the main step (which blocks on ctx.Done below) is
+	// always reached before the deadline fires; otherwise the pipeline would
+	// short-circuit before the step runs and the job error would never be set.
+	ctx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+	defer cancel()
+
+	jim := &jobInfoMock{}
+	sfm := &stepFactoryMock{}
+	rc := &RunContext{
+		JobContainer: &jobContainerMock{},
+		Run: &model.Run{
+			JobID: "test",
+			Workflow: &model.Workflow{
+				Jobs: map[string]*model.Job{
+					"test": {},
+				},
+			},
+		},
+		Config: &Config{},
+	}
+	rc.ExprEval = rc.NewExpressionEvaluator(ctx)
+
+	stepModel := &model.Step{ID: "1"}
+	jim.On("steps").Return([]*model.Step{stepModel})
+	jim.On("matrix").Return(map[string]any{})
+	jim.On("startContainer").Return(func(ctx context.Context) error { return nil })
+	jim.On("interpolateOutputs").Return(func(ctx context.Context) error { return nil })
+	jim.On("closeContainer").Return(func(ctx context.Context) error { return nil })
+	// The job timed out, so it must be reported as failed. stopContainer is left
+	// unexpected on purpose: a timed-out (failed) job preserves its error state, so
+	// the graceful stop is skipped exactly like any other failure without AutoRemove.
+	jim.On("result", "failure")
+
+	sm := &stepMock{}
+	sfm.On("newStep", stepModel, rc).Return(sm, nil)
+	sm.On("pre").Return(func(ctx context.Context) error { return nil })
+	// The main step runs past the job timeout: it blocks until the job context is
+	// done, mirroring a step that overruns timeout-minutes.
+	sm.On("main").Return(func(ctx context.Context) error {
+		<-ctx.Done()
+		return ctx.Err()
+	})
+
+	var postRan bool
+	var postCtxErr error
+	sm.On("post").Return(func(ctx context.Context) error {
+		postRan = true
+		postCtxErr = ctx.Err()
+		return nil
+	})
+
+	executor := newJobExecutor(jim, sfm, rc)
+	// The executor itself returns nil on timeout: the failure is surfaced through
+	// the job result ("failure", asserted via the result mock below), not the
+	// return value.
+	require.NoError(t, executor(ctx))
+
+	assert.True(t, postRan, "post step must run after a job timeout")
+	require.NoError(t, postCtxErr, "post step must run against a fresh, non-expired context")
+
+	jim.AssertExpectations(t)
+	sfm.AssertExpectations(t)
+	sm.AssertExpectations(t)
+}
+
+// TestSetJobResultMatrixContinueOnError exercises the parallel-matrix path
+// end-to-end: two combinations share one *model.Job and continue-on-error is
+// keyed on matrix.experimental, so one combination tolerates its failure and the
+// other does not. The job is reported as continue-on-error only when EVERY failing
+// combination was tolerated; a single firm failure makes the whole job firm, and
+// handleFailure then fails the run.
+func TestSetJobResultMatrixContinueOnError(t *testing.T) {
+	const jobYAML = "continue-on-error: ${{ matrix.experimental }}\nruns-on: ubuntu-latest"
+
+	newSharedJob := func(t *testing.T) (*model.Job, *model.Workflow) {
+		t.Helper()
+		var job *model.Job
+		require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
+		return job, &model.Workflow{
+			Name: "workflow1",
+			Jobs: map[string]*model.Job{"job1": job},
+		}
+	}
+
+	planFor := func(wf *model.Workflow) *model.Plan {
+		return &model.Plan{Stages: []*model.Stage{{Runs: []*model.Run{{Workflow: wf, JobID: "job1"}}}}}
+	}
+
+	ctx := context.Background()
+
+	// fail drives a single matrix combination through the failure path; each
+	// RunContext is its own jobInfo (rc implements jobInfo) and shares the job.
+	fail := func(wf *model.Workflow, experimental bool) {
+		rc := newTestRC(wf, map[string]any{"experimental": experimental})
+		setJobResult(ctx, rc, rc, false)
+	}
+
+	t.Run("one tolerated and one firm failure fails the run", func(t *testing.T) {
+		job, wf := newSharedJob(t)
+		// Order is intentional: the tolerated combination finishes first, then the
+		// firm one. The firm-failure latch must still win regardless of order.
+		fail(wf, true)
+		fail(wf, false)
+
+		assert.Equal(t, "failure", job.Result)
+		assert.False(t, job.ContinueOnError, "a single firm failure must make the whole job firm")
+		assert.Error(t, handleFailure(planFor(wf))(ctx))
+	})
+
+	t.Run("all tolerated failures do not fail the run", func(t *testing.T) {
+		job, wf := newSharedJob(t)
+		fail(wf, true)
+		fail(wf, true)
+
+		assert.Equal(t, "failure", job.Result)
+		assert.True(t, job.ContinueOnError, "every failing combination was tolerated")
+		assert.NoError(t, handleFailure(planFor(wf))(ctx))
+	})
+}
+
 func TestHasJobSummaryCapability(t *testing.T) {
 	assert.True(t, hasJobSummaryCapability("cache,job-summary artifacts"))
 	assert.True(t, hasJobSummaryCapability("cache,\njob-summary\tartifacts"))
@@ -674,3 +802,104 @@ func tarArchive(t *testing.T, entries ...tarEntry) []byte {
 	require.NoError(t, tw.Close())
 	return buf.Bytes()
 }
+
+func newTestRC(wf *model.Workflow, matrix map[string]any) *RunContext {
+	return &RunContext{
+		Config: &Config{
+			Workdir: ".",
+			Platforms: map[string]string{
+				"ubuntu-latest": "ubuntu-latest",
+			},
+		},
+		StepResults: map[string]*model.StepResult{},
+		Env:         map[string]string{},
+		Matrix:      matrix,
+		Run:         &model.Run{JobID: "job1", Workflow: wf},
+	}
+}
+
+func makeTestRC(t *testing.T, jobYAML string) *RunContext {
+	t.Helper()
+	var job *model.Job
+	require.NoError(t, yaml.Unmarshal([]byte(jobYAML), &job))
+	rc := newTestRC(&model.Workflow{
+		Name: "workflow1",
+		Jobs: map[string]*model.Job{"job1": job},
+	}, nil)
+	rc.ExprEval = rc.NewExpressionEvaluator(context.Background())
+	return rc
+}
+
+func TestApplyJobTimeout(t *testing.T) {
+	cases := []struct {
+		name        string
+		yaml        string
+		wantTimeout bool
+	}{
+		{"empty", "runs-on: ubuntu-latest", false},
+		{"integer", "timeout-minutes: 5\nruns-on: ubuntu-latest", true},
+		{"non-numeric ignored", "timeout-minutes: abc\nruns-on: ubuntu-latest", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			rc := makeTestRC(t, tc.yaml)
+			ctx := context.Background()
+			newCtx, cancel := applyJobTimeout(ctx, rc, rc.Run.Job())
+			defer cancel()
+			_, hasDeadline := newCtx.Deadline()
+			assert.Equal(t, tc.wantTimeout, hasDeadline)
+		})
+	}
+}
+
+func TestEvaluateJobContinueOnError(t *testing.T) {
+	cases := []struct {
+		name string
+		yaml string
+		want bool
+	}{
+		{"absent", "runs-on: ubuntu-latest", false},
+		{"true", "continue-on-error: true\nruns-on: ubuntu-latest", true},
+		{"false", "continue-on-error: false\nruns-on: ubuntu-latest", false},
+		{"expression true", "continue-on-error: ${{ 'x' == 'x' }}\nruns-on: ubuntu-latest", true},
+		{"expression false", "continue-on-error: ${{ 'x' != 'x' }}\nruns-on: ubuntu-latest", false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			rc := makeTestRC(t, tc.yaml)
+			got := evaluateJobContinueOnError(context.Background(), rc, rc.Run.Job())
+			assert.Equal(t, tc.want, got)
+		})
+	}
+}
+
+func TestJobSetContinueOnError(t *testing.T) {
+	t.Run("first call true", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		assert.True(t, j.ContinueOnError)
+	})
+	t.Run("first call false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(false)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("true then false locks to false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		j.SetContinueOnError(false)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("false then true stays false", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(false)
+		j.SetContinueOnError(true)
+		assert.False(t, j.ContinueOnError)
+	})
+	t.Run("true then true stays true", func(t *testing.T) {
+		j := &model.Job{}
+		j.SetContinueOnError(true)
+		j.SetContinueOnError(true)
+		assert.True(t, j.ContinueOnError)
+	})
+}
diff --git a/act/runner/runner.go b/act/runner/runner.go
index 4deb683e..ff57c178 100644
--- a/act/runner/runner.go
+++ b/act/runner/runner.go
@@ -250,7 +250,10 @@ func (runner *runnerImpl) NewPlanExecutor(plan *model.Plan) common.Executor {
 							return err
 						}
 
-						return executor(common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix)))
+						jobCtx := common.WithJobErrorContainer(WithJobLogger(ctx, rc.Run.JobID, jobName, rc.Config, &rc.Masks, matrix))
+						jobCtx, cancelTimeout := applyJobTimeout(jobCtx, rc, job)
+						defer cancelTimeout()
+						return executor(jobCtx)
 					})
 				}
 				// Run all matrix combinations of this job, then drop its aggregation mutex: the
@@ -305,7 +308,7 @@ func handleFailure(plan *model.Plan) common.Executor {
 	return func(ctx context.Context) error {
 		for _, stage := range plan.Stages {
 			for _, run := range stage.Runs {
-				if run.Job().Result == "failure" {
+				if run.Job().Result == "failure" && !run.Job().ContinueOnError {
 					return fmt.Errorf("Job '%s' failed", run.String())
 				}
 			}