fix: clean up job network and container when container start fails (#986)

The teardown that removes a job's per-job network and container runs as a `Finally` on the step pipeline in `newJobExecutor`, which only executes after a successful start. When the start itself fails (e.g. a `docker cp` error from a buggy daemon), that `Finally` is skipped, so the network and container leak. Repeated failures eventually exhaust Docker's address pool, after which later jobs can no longer create networks.

This change tears them down in `startContainer` when the start returns an error, reusing the existing `cleanUpJobContainer` teardown.

Exposed by the daemon regression in https://gitea.com/gitea/runner/issues/981, where every failed `docker cp` leaked a per-job network.

---

This PR was written with the help of Claude Opus 4.7

Reviewed-on: https://gitea.com/gitea/runner/pulls/986
Reviewed-by: Nicolas <bircni@icloud.com>
Co-authored-by: silverwind <me@silverwind.io>
Co-committed-by: silverwind <me@silverwind.io>
2026-06-10 02:54:23 +02:00 · 2026-05-21 15:19:01 +00:00
parent 7b5ebe9618
commit b30204aa94
2 changed files with 77 additions and 2 deletions
--- a/act/runner/run_context.go
+++ b/act/runner/run_context.go
@@ -601,10 +601,34 @@ func (rc *RunContext) interpolateOutputs() common.Executor {

 func (rc *RunContext) startContainer() common.Executor {
 	return func(ctx context.Context) error {
+		var err error
 		if rc.IsHostEnv(ctx) {
-			return rc.startHostEnvironment()(ctx)
+			err = rc.startHostEnvironment()(ctx)
+		} else {
+			err = rc.startJobContainer()(ctx)
 		}
-		return rc.startJobContainer()(ctx)
+		if err != nil {
+			// The job executor's teardown only runs after a successful start, so a failed
+			// start would otherwise leak the per-job network and container.
+			rc.cleanupFailedStart(ctx)
+		}
+		return err
+	}
+}
+
+func (rc *RunContext) cleanupFailedStart(ctx context.Context) {
+	if rc.cleanUpJobContainer == nil {
+		return
+	}
+	cleanCtx := ctx
+	if ctx.Err() != nil {
+		// the start likely failed because ctx was cancelled, detach so teardown still runs
+		var cancel context.CancelFunc
+		cleanCtx, cancel = context.WithTimeout(common.WithLogger(context.Background(), common.Logger(ctx)), time.Minute)
+		defer cancel()
+	}
+	if err := rc.cleanUpJobContainer(cleanCtx); err != nil {
+		common.Logger(ctx).Errorf("Error while cleaning up after failed container start for job %s: %v", rc.JobName, err)
 	}
 }