feat: ipv6 options for network container creation (#1029 )

Here is a final proposal for enabling IPv6 on the temporary network created by the Gitea runner. --------- Co-authored-by: Nicolas <bircni@icloud.com> Co-authored-by: Nicolas Schwartz <9308314+StarAurryon@users.noreply.github.com> Reviewed-on: https://gitea.com/gitea/runner/pulls/1029 Reviewed-by: Nicolas <bircni@icloud.com> Co-authored-by: StarAurryon <206206+staraurryon@noreply.gitea.com> Co-committed-by: StarAurryon <206206+staraurryon@noreply.gitea.com>
fix(cleanup): kill Unix step process group on cancel to avoid hang (#1025 )
2026-06-15 14:24:22 +02:00 · 2026-06-15 05:05:20 +00:00 · 2026-06-14 20:52:42 +00:00 · 2026-06-14 20:43:19 +00:00 · 2026-06-14 14:14:43 +00:00
24 changed files with 872 additions and 152 deletions
--- a/act/common/line_writer.go
+++ b/act/common/line_writer.go
@@ -12,6 +12,13 @@ import (
 // LineHandler is a callback function for handling a line
 type LineHandler func(line string) bool
 // Flusher is implemented by writers that buffer a trailing, not-yet-terminated
 // line. Callers should flush once the underlying stream has reached EOF so the
 // final line (when it is not newline-terminated) is not lost.
 type Flusher interface {
 	// Flush emits whatever trailing content the writer is still buffering.
 	Flush()
 }
 type lineWriter struct {
 	buffer   bytes.Buffer
 	handlers []LineHandler
@@ -24,6 +31,14 @@ func NewLineWriter(handlers ...LineHandler) io.Writer {
 	return w
 }
 // FlushWriter drains w's pending partial line when w is a Flusher. A writer of
 // any other concrete type is left untouched, which lets callers invoke this on
 // an arbitrary io.Writer without inspecting it first.
 func FlushWriter(w io.Writer) {
 	flusher, isFlusher := w.(Flusher)
 	if !isFlusher {
 		return
 	}
 	flusher.Flush()
 }
 func (lw *lineWriter) Write(p []byte) (n int, err error) {
 	pBuf := bytes.NewBuffer(p)
 	written := 0
@@ -44,6 +59,17 @@ func (lw *lineWriter) Write(p []byte) (n int, err error) {
 	return written, nil
 }
 // Flush hands whatever is still buffered (a trailing line that never received
 // its newline) to the handlers as one last line and then clears the buffer.
 // With nothing buffered it does nothing, so repeated calls are harmless.
 func (lw *lineWriter) Flush() {
 	if pending := lw.buffer.String(); pending != "" {
 		lw.handleLine(pending)
 		lw.buffer.Reset()
 	}
 }
 func (lw *lineWriter) handleLine(line string) {
 	for _, h := range lw.handlers {
 		ok := h(line)
--- a/act/common/line_writer_test.go
+++ b/act/common/line_writer_test.go
@@ -5,6 +5,7 @@
 package common
 import (
 	"io"
 	"testing"
 	"github.com/stretchr/testify/assert"
@@ -39,3 +40,33 @@ func TestLineWriter(t *testing.T) {
 	assert.Equal(" and another\n", lines[2])
 	assert.Equal("last line\n", lines[3])
 }
 // TestLineWriterFlush checks that a trailing line without a newline stays
 // buffered until the writer is flushed, and that a second flush adds nothing.
 func TestLineWriterFlush(t *testing.T) {
 	got := []string{}
 	w := NewLineWriter(func(line string) bool {
 		got = append(got, line)
 		return true
 	})
 	a := assert.New(t)
 	_, writeErr := w.Write([]byte("complete line\npartial line without newline"))
 	a.NoError(writeErr) //nolint:testifylint // pre-existing pattern from nektos/act
 	// Before any flush only the newline-terminated line has been delivered.
 	a.Equal([]string{"complete line\n"}, got)
 	// The first flush delivers the buffered remainder as its own line.
 	FlushWriter(w)
 	a.Equal([]string{"complete line\n", "partial line without newline"}, got)
 	// A second flush finds an empty buffer and must not deliver anything.
 	FlushWriter(w)
 	a.Len(got, 2)
 }
 func TestFlushWriterIgnoresNonFlusher(t *testing.T) {
 	// FlushWriter must be a safe no-op for writers that do not buffer lines.
 	assert.NotPanics(t, func() { FlushWriter(io.Discard) })
 }
--- a/act/container/container_types.go
+++ b/act/container/container_types.go
@@ -84,6 +84,12 @@ type NewDockerBuildExecutorInput struct {
 	Platform     string
 }
 // NewDockerNetworkCreateExecutorInput is the input for the
 // NewDockerNetworkCreateExecutor function.
 type NewDockerNetworkCreateExecutorInput struct {
 	// EnableIPv4 is forwarded unchanged as the EnableIPv4 option of the
 	// network-create call. NOTE(review): a nil pointer presumably leaves the
 	// Docker daemon's default in place — confirm against the moby client.
 	EnableIPv4 *bool
 	// EnableIPv6 is forwarded unchanged as the EnableIPv6 option of the
 	// network-create call; nil is presumably handled like EnableIPv4's nil.
 	EnableIPv6 *bool
 }
 // NewDockerPullExecutorInput the input for the NewDockerPullExecutor function
 type NewDockerPullExecutorInput struct {
 	Image     string
--- a/act/container/docker_network.go
+++ b/act/container/docker_network.go
@@ -14,7 +14,7 @@ import (
 	"github.com/moby/moby/client"
 )
-func NewDockerNetworkCreateExecutor(name string) common.Executor {
+func NewDockerNetworkCreateExecutor(name string, opts NewDockerNetworkCreateExecutorInput) common.Executor {
 	return func(ctx context.Context) error {
 		cli, err := GetDockerClient(ctx)
 		if err != nil {
@@ -39,6 +39,8 @@ func NewDockerNetworkCreateExecutor(name string) common.Executor {
 		_, err = cli.NetworkCreate(ctx, name, client.NetworkCreateOptions{
 			Driver:     "bridge",
 			Scope:      "local",
 			EnableIPv4: opts.EnableIPv4,
 			EnableIPv6: opts.EnableIPv6,
 		})
 		if err != nil {
 			return err
--- a/act/container/docker_run.go
+++ b/act/container/docker_run.go
@@ -20,6 +20,7 @@ import (
 	"slices"
 	"strconv"
 	"strings"
 	"time"
 	"gitea.com/gitea/runner/act/common"
 	"gitea.com/gitea/runner/act/filecollector"
@@ -45,6 +46,13 @@ import (
 	"github.com/spf13/pflag"
 )
 // drainGracePeriod bounds how long we wait for an output-copy goroutine to
 // finish draining a container's output before returning, so that neither a
 // cancellation (waitForCommand) nor a normal container exit (wait) truncates
 // the tail of the log. It is a safety bound: in the common case the stream
 // reaches EOF and the goroutine returns well before this elapses.
 const drainGracePeriod = 2 * time.Second
 // NewContainer creates a reference to a container
 func NewContainer(input *NewContainerInput) ExecutionsEnvironment {
 	cr := new(containerReference)
@@ -229,6 +237,10 @@ type containerReference struct {
 	input *NewContainerInput
 	UID   int
 	GID   int
 	// attachDone is closed by the attach() streaming goroutine once it has
 	// drained and flushed the container's output. wait() blocks on it so the
 	// tail of the log lands before the step proceeds.
 	attachDone chan struct{}
 	LinuxContainerEnvironmentExtensions
 }
@@ -730,7 +742,9 @@ func (cr *containerReference) tryReadGID() common.Executor {
 func (cr *containerReference) waitForCommand(ctx context.Context, isTerminal bool, resp client.HijackedResponse, _ client.ExecCreateResult, _, _ string) error {
 	logger := common.Logger(ctx)
-	cmdResponse := make(chan error)
+	// Buffered so the copy goroutine never blocks on send if the grace-period
 	// drain below times out and no one is left to receive.
 	cmdResponse := make(chan error, 1)
 	go func() {
 		var outWriter io.Writer
@@ -749,6 +763,11 @@ func (cr *containerReference) waitForCommand(ctx context.Context, isTerminal boo
 		} else {
 			_, err = io.Copy(outWriter, resp.Reader)
 		}
 		// Flush any buffered, not-yet-newline-terminated trailing line so the
 		// final line of a command's output is not lost (e.g. an error message
 		// printed without a trailing newline before the process exits).
 		common.FlushWriter(outWriter)
 		common.FlushWriter(errWriter)
 		cmdResponse <- err
 	}()
@@ -760,6 +779,16 @@ func (cr *containerReference) waitForCommand(ctx context.Context, isTerminal boo
 			logger.Warnf("Failed to send CTRL+C: %+s", err)
 		}
 		// Give the copy goroutine a brief grace period to drain output already
 		// produced by the command before we return, so cancellation does not
 		// truncate the tail of the log. The goroutine exits once the hijacked
 		// stream is closed by resp.Close() in the caller's defer.
 		select {
 		case <-cmdResponse:
 		case <-time.After(drainGracePeriod):
 			logger.Warn("Timed out draining command output after cancellation")
 		}
 		// we return the context canceled error to prevent other steps
 		// from executing
 		return ctx.Err()
@@ -945,14 +974,23 @@ func (cr *containerReference) attach() common.Executor {
 		if errWriter == nil {
 			errWriter = os.Stderr
 		}
 		done := make(chan struct{})
 		cr.attachDone = done
 		go func() {
 			defer close(done)
 			var copyErr error
 			if !isTerminal || os.Getenv("NORAW") != "" {
-				_, err = stdcopy.StdCopy(outWriter, errWriter, out.Reader)
+				_, copyErr = stdcopy.StdCopy(outWriter, errWriter, out.Reader)
 			} else {
-				_, err = io.Copy(outWriter, out.Reader)
+				_, copyErr = io.Copy(outWriter, out.Reader)
 			}
-			if err != nil {
+			// Flush any buffered, not-yet-newline-terminated trailing line once
-				common.Logger(ctx).Error(err)
+			// the stream reaches EOF, so the final line of the container's
 			// output is not lost when it is not newline-terminated.
 			common.FlushWriter(outWriter)
 			common.FlushWriter(errWriter)
 			if copyErr != nil {
 				common.Logger(ctx).Error(copyErr)
 			}
 		}()
 		return nil
@@ -991,6 +1029,18 @@ func (cr *containerReference) wait() common.Executor {
 		logger.Debugf("Return status: %v", statusCode)
 		// The container has exited; wait for the attach() streaming goroutine to
 		// finish draining and flushing its output before returning, so the tail
 		// of the log is not lost. Bounded so a stuck stream cannot hang the step.
 		if cr.attachDone != nil {
 			select {
 			case <-cr.attachDone:
 			case <-time.After(drainGracePeriod):
 				logger.Warn("Timed out draining container output")
 			}
 			cr.attachDone = nil
 		}
 		if statusCode == 0 {
 			return nil
 		}
--- a/act/container/docker_run_test.go
+++ b/act/container/docker_run_test.go
@@ -8,6 +8,7 @@ import (
 	"bufio"
 	"bytes"
 	"context"
 	"encoding/binary"
 	"errors"
 	"io"
 	"net"
@@ -20,6 +21,7 @@ import (
 	"gitea.com/gitea/runner/act/common"
 	cerrdefs "github.com/containerd/errdefs"
 	"github.com/moby/moby/api/pkg/stdcopy"
 	"github.com/moby/moby/api/types/container"
 	mobyclient "github.com/moby/moby/client"
 	"github.com/sirupsen/logrus/hooks/test"
@@ -89,6 +91,11 @@ func (m *mockDockerClient) ExecInspect(ctx context.Context, execID string, opts
 	return args.Get(0).(mobyclient.ExecInspectResult), args.Error(1)
 }
 // ContainerAttach records the call on the mock and replays the configured
 // attach result and error.
 func (m *mockDockerClient) ContainerAttach(ctx context.Context, containerID string, opts mobyclient.ContainerAttachOptions) (mobyclient.ContainerAttachResult, error) {
 	call := m.Called(ctx, containerID, opts)
 	result := call.Get(0).(mobyclient.ContainerAttachResult)
 	return result, call.Error(1)
 }
 func (m *mockDockerClient) ContainerWait(ctx context.Context, containerID string, opts mobyclient.ContainerWaitOptions) mobyclient.ContainerWaitResult {
 	args := m.Called(ctx, containerID, opts)
 	return args.Get(0).(mobyclient.ContainerWaitResult)
@@ -206,6 +213,71 @@ func TestDockerExecFailure(t *testing.T) {
 	client.AssertExpectations(t)
 }
 // stdcopyFrame builds one frame of Docker's multiplexed stream format, which
 // is what StdCopy consumes: an 8-byte header (byte 0 is the stream type, bytes
 // 4-7 hold the payload length as a big-endian uint32) followed by the payload.
 func stdcopyFrame(stream stdcopy.StdType, payload string) []byte {
 	frame := make([]byte, 8, 8+len(payload))
 	frame[0] = byte(stream)
 	binary.BigEndian.PutUint32(frame[4:], uint32(len(payload)))
 	return append(frame, payload...)
 }
 // TestDockerAttachFlushesTrailingLine verifies that wait() blocks until the
 // attach() streaming goroutine has drained and flushed the container's output,
 // so a final line without a trailing newline is not lost.
 func TestDockerAttachFlushesTrailingLine(t *testing.T) {
 	ctx := context.Background()
 	// A single stdout frame whose payload ends without a newline: the second
 	// line can only surface once the line writer is flushed.
 	framed := bytes.NewBuffer(stdcopyFrame(stdcopy.Stdout, "line one\nlast line without newline"))
 	var lines []string
 	// Collect every line the log writer hands to its handler.
 	logWriter := common.NewLineWriter(func(s string) bool {
 		lines = append(lines, s)
 		return true
 	})
 	client := &mockDockerClient{}
 	// The mocked attach returns the pre-framed buffer as the output stream.
 	client.On("ContainerAttach", ctx, "123", mock.AnythingOfType("client.ContainerAttachOptions")).
 		Return(mobyclient.ContainerAttachResult{
 			HijackedResponse: mobyclient.HijackedResponse{
 				Conn:   &mockConn{},
 				Reader: bufio.NewReader(framed),
 			},
 		}, nil)
 	// Pre-load an exit status of 0 into the buffered channel; errCh is left
 	// empty. The mock hands out receive-only views of both channels.
 	statusCh := make(chan container.WaitResponse, 1)
 	statusCh <- container.WaitResponse{StatusCode: 0}
 	errCh := make(chan error, 1)
 	client.On("ContainerWait", ctx, "123", mobyclient.ContainerWaitOptions{Condition: container.WaitConditionNotRunning}).
 		Return(mobyclient.ContainerWaitResult{
 			Result: (<-chan container.WaitResponse)(statusCh),
 			Error:  (<-chan error)(errCh),
 		})
 	// Stdout and Stderr share the same line writer, so all output lands in lines.
 	cr := &containerReference{
 		id:  "123",
 		cli: client,
 		input: &NewContainerInput{
 			Image:  "image",
 			Stdout: logWriter,
 			Stderr: logWriter,
 		},
 	}
 	require.NoError(t, cr.attach()(ctx))
 	require.NoError(t, cr.wait()(ctx))
 	// wait() must have blocked until the goroutine drained AND flushed; the
 	// trailing, non-newline-terminated line must therefore be present. Reading
 	// lines here is race-free because wait() synchronizes on attachDone, which
 	// the goroutine closes after the final append.
 	assert.Equal(t, []string{"line one\n", "last line without newline"}, lines)
 	client.AssertExpectations(t)
 }
 func TestDockerWaitFailure(t *testing.T) {
 	ctx := context.Background()
--- a/act/container/docker_stub.go
+++ b/act/container/docker_stub.go
@@ -61,7 +61,7 @@ func NewDockerVolumeRemoveExecutor(volume string, force bool) common.Executor {
 	}
 }
-func NewDockerNetworkCreateExecutor(name string) common.Executor {
+func NewDockerNetworkCreateExecutor(name string, opts NewDockerNetworkCreateExecutorInput) common.Executor {
 	return func(ctx context.Context) error {
 		return nil
 	}
--- a/act/container/host_environment.go
+++ b/act/container/host_environment.go
@@ -323,15 +323,15 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
 	cmd.Dir = wd
 	cmd.SysProcAttr = getSysProcAttr(cmdline, false)
-	// On Windows a step often launches a process tree (a shell that starts a
+	// A step often launches a process tree (a shell that starts a child which
-	// child which spawns further GUI or background processes). The default
+	// spawns further background or GUI processes). The default context
-	// context cancellation only kills the direct child, leaving the rest of the
+	// cancellation only kills the direct child, leaving the rest of the tree
-	// tree running; and because the orphans inherit cmd's stdout/stderr pipe,
+	// running; and because the orphans inherit cmd's stdout/stderr pipe,
-	// cmd.Wait() would block forever, hanging the runner. Kill the whole tree
+	// cmd.Wait() would block forever, hanging the runner. Kill the whole tree on
-	// via a Job Object on cancellation, and bound the wait so a leftover pipe
+	// cancellation — via a Job Object on Windows and the process group on Unix
-	// writer can never hang Wait indefinitely.
+	// (see processKiller) — and bound the wait so a leftover pipe writer can
 	// never hang Wait indefinitely.
 	var killer atomic.Pointer[processKiller]
 	if runtime.GOOS == "windows" {
 	cmd.Cancel = func() error {
 		if k := killer.Load(); k != nil {
 			return k.Kill()
@@ -341,10 +341,10 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
 		}
 		return nil
 	}
-		// Once the step process has exited, give its I/O pipes at most this long
+	// Once the step process has exited, give its I/O pipes at most this long to
-		// to drain before Wait force-closes them and returns (Go's WaitDelay).
+	// drain before Wait force-closes them and returns (Go's WaitDelay). This
 	// also covers a step that backgrounds a process holding the pipe open.
 	cmd.WaitDelay = 10 * time.Second
 	}
 	var ppty *os.File
 	var tty *os.File
@@ -375,18 +375,17 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st
 	if err := cmd.Start(); err != nil {
 		return err
 	}
-	if runtime.GOOS == "windows" {
+	// Capture the started process for tree-kill on cancellation: a Job Object on
-		// Assign the started process to a Job Object so cmd.Cancel can kill the
+	// Windows (children spawned afterwards are auto-included) and the process
-		// whole descendant tree. Children spawned afterwards are auto-included.
+	// group on Unix. On failure (e.g. Windows nested-job restrictions) we fall
-		// On failure (e.g. nested-job restrictions) we fall back to the default
+	// back to the default single-process kill; WaitDelay + end-of-job cleanup
-		// single-process kill; WaitDelay + end-of-job cleanup still apply.
+	// still apply.
 	if k, kerr := newProcessKiller(cmd.Process); kerr != nil {
 		common.Logger(ctx).Warnf("process tree kill setup failed, falling back to single-process kill: %v", kerr)
 	} else {
 		killer.Store(k)
 		defer k.Close()
 	}
 	}
 	err = cmd.Wait()
 	if err != nil {
 		var exitErr *exec.ExitError
@@ -429,6 +428,24 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string)
 	return parseEnvFile(e, srcPath, env)
 }
 // removeAll is the delete primitive behind removeAllWithContext. It is a
 // package-level var so tests can swap in a blocking stub instead of patching
 // os.RemoveAll.
 var removeAll = os.RemoveAll
 // removeAllWithContext starts removeAll(path) on its own goroutine and returns
 // its result, or ctx.Err() if ctx is done first. When ctx wins, the goroutine
 // keeps running: a delete stuck inside a syscall cannot be interrupted. The
 // result channel is buffered so that abandoned goroutine can still send and
 // exit.
 func removeAllWithContext(ctx context.Context, path string) error {
 	result := make(chan error, 1)
 	go func() {
 		result <- removeAll(path)
 	}()
 	select {
 	case <-ctx.Done():
 		return ctx.Err()
 	case rmErr := <-result:
 		return rmErr
 	}
 }
 func removePathWithRetry(ctx context.Context, path string) error {
 	if path == "" {
 		return nil
@@ -448,10 +465,13 @@ func removePathWithRetry(ctx context.Context, path string) error {
 			case <-time.After(delay):
 			}
 		}
-		lastErr = os.RemoveAll(path)
+		lastErr = removeAllWithContext(ctx, path)
 		if lastErr == nil {
 			return nil
 		}
 		if errors.Is(lastErr, context.DeadlineExceeded) {
 			return lastErr
 		}
 	}
 	return lastErr
 }
@@ -533,23 +553,61 @@ func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) {
 	}
 }
 // hostCleanupTimeout bounds each filesystem-teardown phase of the host
 // environment so a single stalled delete cannot wedge the runner slot forever.
 // A var (not const) so tests can shrink it.
 var hostCleanupTimeout = 30 * time.Second
 // runWithTimeout executes fn on a separate goroutine and waits for whichever
 // happens first: fn returning (result nil) or timeout expiring (result
 // context.DeadlineExceeded). After a timeout the goroutine is deliberately
 // abandoned, because an os.RemoveAll stuck inside a delete syscall (AV/EDR
 // filter drivers, an unresponsive network mount, a dying disk) cannot be
 // interrupted. Leaking that goroutine and the scratch state it was deleting is
 // strictly better than blocking the caller forever and permanently losing the
 // runner's capacity slot; the leaked scratch dir is reclaimed later by the
 // runner's idle stale-dir sweep.
 func runWithTimeout(fn func(), timeout time.Duration) error {
 	finished := make(chan struct{})
 	worker := func() {
 		defer close(finished)
 		fn()
 	}
 	go worker()
 	expiry := time.NewTimer(timeout)
 	defer expiry.Stop()
 	select {
 	case <-expiry.C:
 		return context.DeadlineExceeded
 	case <-finished:
 		return nil
 	}
 }
 func (e *HostEnvironment) Remove() common.Executor {
 	return func(ctx context.Context) error {
 		logger := common.Logger(ctx)
 		// Ensure any lingering child processes are ended before attempting
 		// to remove the workspace (Windows file locks otherwise prevent cleanup).
 		e.terminateRunningProcesses(ctx)
 		// Only removes per-job misc state. Must not remove the cache/toolcache root.
 		// Bound it: CleanUp is a caller-supplied, typically unbounded os.RemoveAll,
 		// and a delete stalled by a filesystem filter driver would otherwise hang
 		// the job forever at "Cleaning up container" and hold the capacity slot.
 		if e.CleanUp != nil {
-			e.CleanUp()
+			logger.Debugf("running host environment cleanup callback")
 			if err := runWithTimeout(e.CleanUp, hostCleanupTimeout); err != nil {
 				logger.Warnf("host environment cleanup did not finish within %s; continuing job completion, scratch state may be leaked and is reclaimed by the idle stale-dir sweep", hostCleanupTimeout)
 			} else {
 				logger.Debugf("host environment cleanup callback finished")
 			}
 		}
 		// Detach: a cancelled ctx would skip removePathWithRetry's retries,
 		// which absorb Windows file-handle release lag after the kill above.
-		rmCtx, rmCancel := context.WithTimeout(context.Background(), 30*time.Second)
+		rmCtx, rmCancel := context.WithTimeout(context.Background(), hostCleanupTimeout)
 		defer rmCancel()
 		logger := common.Logger(ctx)
 		var errs []error
 		if err := removePathWithRetry(rmCtx, e.Path); err != nil {
 			logger.Warnf("failed to remove host misc state %s: %v", e.Path, err)
@@ -561,8 +619,15 @@ func (e *HostEnvironment) Remove() common.Executor {
 				errs = append(errs, err)
 			}
 		}
 		for _, err := range errs {
 			if !errors.Is(err, context.DeadlineExceeded) {
 				return errors.Join(errs...)
 			}
 		}
 		// Bounded teardown timed out; warnings already logged above. Do not
 		// fail job completion — leaked scratch is reclaimed by the idle sweep.
 		return nil
 	}
 }
 func (e *HostEnvironment) ToContainerPath(path string) string {
--- a/act/container/host_environment_test.go
+++ b/act/container/host_environment_test.go
@@ -15,6 +15,7 @@ import (
 	"runtime"
 	"strings"
 	"testing"
 	"time"
 	"gitea.com/gitea/runner/act/common"
@@ -188,6 +189,118 @@ func TestHostEnvironmentRemoveCleansWorkdirWhenOwned(t *testing.T) {
 	assert.ErrorIs(t, err, os.ErrNotExist)
 }
 // TestRemoveAllWithContextDoesNotHangOnStuckDelete replaces removeAll with a
 // stub that blocks until released and checks that removeAllWithContext still
 // returns context.DeadlineExceeded once its context times out.
 func TestRemoveAllWithContextDoesNotHangOnStuckDelete(t *testing.T) {
 	// release unblocks the stub; stubDone is closed when the stub returns.
 	release := make(chan struct{})
 	stubDone := make(chan struct{})
 	orig := removeAll
 	removeAll = func(string) error {
 		defer close(stubDone)
 		<-release
 		return nil
 	}
 	// removeAllWithContext intentionally leaks the delete goroutine on timeout,
 	// and that goroutine still references removeAll. Unblock it and wait for it
 	// to return before restoring the var, so the restore can't race the read.
 	t.Cleanup(func() {
 		close(release)
 		<-stubDone
 		removeAll = orig
 	})
 	// The stub never finishes on its own, so only the deadline can end the call.
 	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer cancel()
 	err := removeAllWithContext(ctx, t.TempDir())
 	require.ErrorIs(t, err, context.DeadlineExceeded)
 }
 // TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp guards against a stalled
 // CleanUp callback (e.g. an os.RemoveAll blocked by an AV/EDR filter driver or
 // an unresponsive mount) wedging the runner slot forever at "Cleaning up
 // container". Remove must time out the callback and complete job teardown.
 func TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp(t *testing.T) {
 	// Keep the suite fast: shrink the per-phase teardown timeout for this test.
 	orig := hostCleanupTimeout
 	hostCleanupTimeout = 100 * time.Millisecond
 	t.Cleanup(func() { hostCleanupTimeout = orig })
 	logger := logrus.New()
 	ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
 	// A real directory under the test's temp dir stands in for the misc state.
 	base := t.TempDir()
 	path := filepath.Join(base, "misc", "hostexecutor")
 	require.NoError(t, os.MkdirAll(path, 0o700))
 	release := make(chan struct{})
 	t.Cleanup(func() { close(release) }) // unblock the leaked goroutine at test end
 	e := &HostEnvironment{
 		Path: path,
 		CleanUp: func() {
 			<-release // simulate a delete syscall stuck indefinitely
 		},
 		StdOut: os.Stdout,
 	}
 	// Run Remove on its own goroutine so a regression shows up as a test
 	// failure after the 10s watchdog instead of hanging the whole test binary.
 	done := make(chan error, 1)
 	go func() { done <- e.Remove()(ctx) }()
 	select {
 	case err := <-done:
 		require.NoError(t, err)
 	case <-time.After(10 * time.Second):
 		t.Fatal("Remove() hung on a stuck CleanUp callback")
 	}
 }
 // TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval guards against a
 // stalled os.RemoveAll on the misc/workspace paths (same AV/EDR wedge as
 // #1023) wedging job completion after the CleanUp callback has already timed
 // out or finished.
 func TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval(t *testing.T) {
 	// Shrink the teardown timeout so the stuck delete is abandoned quickly.
 	origTimeout := hostCleanupTimeout
 	hostCleanupTimeout = 100 * time.Millisecond
 	t.Cleanup(func() { hostCleanupTimeout = origTimeout })
 	// release unblocks the stub; stubDone is closed when the stub returns.
 	release := make(chan struct{})
 	stubDone := make(chan struct{})
 	origRemoveAll := removeAll
 	removeAll = func(string) error {
 		defer close(stubDone)
 		<-release
 		return nil
 	}
 	// The stuck delete goroutine outlives the timed-out Remove and still reads
 	// removeAll; unblock it and wait before restoring to avoid a restore/read race.
 	t.Cleanup(func() {
 		close(release)
 		<-stubDone
 		removeAll = origRemoveAll
 	})
 	logger := logrus.New()
 	ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
 	base := t.TempDir()
 	path := filepath.Join(base, "misc", "hostexecutor")
 	require.NoError(t, os.MkdirAll(path, 0o700))
 	// No CleanUp callback is set here: only the path removal can stall.
 	e := &HostEnvironment{
 		Path:   path,
 		StdOut: os.Stdout,
 	}
 	// Run Remove on its own goroutine so a regression shows up as a test
 	// failure after the 10s watchdog instead of hanging the whole test binary.
 	done := make(chan error, 1)
 	go func() { done <- e.Remove()(ctx) }()
 	select {
 	case err := <-done:
 		require.NoError(t, err)
 	case <-time.After(10 * time.Second):
 		t.Fatal("Remove() hung on a stuck path removal")
 	}
 }
 func TestBuildWindowsWorkspaceKillScript(t *testing.T) {
 	t.Run("single dir", func(t *testing.T) {
 		s := buildWindowsWorkspaceKillScript([]string{`C:\workspace\job1`})
--- a/act/container/process_other.go
+++ b/act/container/process_other.go
@@ -1,19 +1,29 @@
 // Copyright 2026 The Gitea Authors. All rights reserved.
 // SPDX-License-Identifier: MIT
-//go:build !windows
+//go:build plan9
 package container
 import "os"
-// processKiller is a no-op on non-Windows platforms. The Job Object based
+// processKiller falls back to single-process termination on platforms without
-// tree-kill is only wired in on Windows (see exec()); elsewhere the default
+// a process-group / Job Object tree-kill. The Job Object (Windows) and process
-// exec.CommandContext cancellation and Setpgid handling apply.
+// group (Unix) based tree-kills live in process_windows.go / process_unix.go;
-type processKiller struct{}
+// here we just kill the direct child, matching the previous default behaviour.
 type processKiller struct {
 	p *os.Process
 }
-func newProcessKiller(_ *os.Process) (*processKiller, error) { return &processKiller{}, nil }
+func newProcessKiller(p *os.Process) (*processKiller, error) {
 	return &processKiller{p: p}, nil
 }
-func (k *processKiller) Kill() error { return nil }
+func (k *processKiller) Kill() error {
 	if k == nil || k.p == nil {
 		return nil
 	}
 	return k.p.Kill()
 }
 func (k *processKiller) Close() error { return nil }
--- a/act/container/process_unix.go
+++ b/act/container/process_unix.go
@@ -0,0 +1,56 @@
 // Copyright 2026 The Gitea Authors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //go:build !windows && !plan9
 package container
 import (
 	"errors"
 	"os"
 	"syscall"
 )
 // processKiller terminates a step process together with its whole process
 // group, which is the Unix counterpart of the Windows Job Object tree-kill.
 //
 // Background: a step often launches a process tree (a shell that starts a child
 // which in turn spawns further background processes). The default
 // exec.CommandContext cancellation only kills the direct child, so cancelling a
 // job left the rest of the tree running. Because those orphans inherited the
 // step's stdout/stderr pipe, cmd.Wait() also blocked forever and the runner
 // hung.
 //
 // Steps are started with Setpgid (or Setsid for the PTY path, see
 // getSysProcAttr), which makes the step process the leader of a new process
 // group whose ID equals its PID. Signalling the negative PID delivers to every
 // process still in that group, so we can tear down the whole tree atomically on
 // cancellation, which also closes the inherited pipe handles so cmd.Wait() can
 // return.
 type processKiller struct {
 	// pgid is the process group to signal. A value <= 0 means there is
 	// nothing to kill and turns Kill into a no-op.
 	pgid int
 }
 // newProcessKiller captures the process group of p (an already-started
 // process). Because the step is launched with Setpgid/Setsid, p is a group
 // leader and its PGID equals its PID; children spawned afterwards stay in the
 // same group unless they explicitly create their own.
 //
 // A nil p yields a killer whose Kill is a no-op instead of panicking, matching
 // the single-process fallback implementation used on other platforms. The
 // returned error is always nil on Unix.
 func newProcessKiller(p *os.Process) (*processKiller, error) {
 	if p == nil {
 		return &processKiller{}, nil
 	}
 	return &processKiller{pgid: p.Pid}, nil
 }
 // Kill sends SIGKILL to the entire process group (the step process and every
 // descendant that stayed in the group). A missing group (ESRCH) means the
 // processes already exited and is not treated as an error. Calling Kill on a
 // nil or zero-value killer does nothing and returns nil.
 func (k *processKiller) Kill() error {
 	if k == nil || k.pgid <= 0 {
 		return nil
 	}
 	// A negative pid addresses the process group rather than a single process.
 	err := syscall.Kill(-k.pgid, syscall.SIGKILL)
 	if err == nil || errors.Is(err, syscall.ESRCH) {
 		return nil
 	}
 	return err
 }
 // Close is a no-op on Unix; there is no job handle to release.
 func (k *processKiller) Close() error { return nil }
--- a/act/container/process_unix_test.go
+++ b/act/container/process_unix_test.go
@@ -0,0 +1,100 @@
 // Copyright 2026 The Gitea Authors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //go:build !windows && !plan9
 package container
 import (
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strconv"
 	"strings"
 	"syscall"
 	"testing"
 	"time"
 	"github.com/stretchr/testify/require"
 )
 // processAlive reports whether pid refers to a still-running process. Signal 0
 // performs error checking without delivering a signal: a nil error or EPERM
 // (the process exists but we are not allowed to signal it) means the process
 // exists, while any other error, notably ESRCH, means it is gone.
 //
 // On Linux, zombie processes (state Z in /proc/<pid>/stat) appear alive to
 // kill(0) but have already terminated — their corpse lingers until the parent
 // calls wait(). In a Docker container the child may be reparented to a PID 1
 // that does not reap promptly, so we treat zombies as not alive.
 func processAlive(pid int) bool {
 	// syscall.Kill returns the raw syscall.Errno, so a direct comparison is
 	// sufficient. EPERM must not be mistaken for "process gone".
 	if err := syscall.Kill(pid, 0); err != nil && err != syscall.EPERM {
 		return false
 	}
 	// Where /proc is readable (Linux), check whether the process is a zombie;
 	// elsewhere the signal-0 result above is the final answer.
 	stat, readErr := os.ReadFile(fmt.Sprintf("/proc/%d/stat", pid))
 	if readErr != nil {
 		return true
 	}
 	// Format: "pid (comm) state ..." — state follows the closing ')' of the
 	// command name (which may itself contain spaces and parens), hence the
 	// search for the LAST ") ".
 	text := string(stat)
 	idx := strings.LastIndex(text, ") ")
 	if idx < 0 {
 		return true
 	}
 	fields := strings.Fields(text[idx+2:])
 	// State "Z" is a zombie: terminated but not yet reaped.
 	return len(fields) == 0 || fields[0] != "Z"
 }
 // TestProcessKillerKillsTree verifies that a process group captured by the
 // killer is terminated together with a child the step spawns afterwards. This
 // mirrors a step that launches a child which spawns further processes, where
 // cancelling the job must take down the whole tree, not just the direct child.
 func TestProcessKillerKillsTree(t *testing.T) {
 	dir := t.TempDir()
 	pidFile := filepath.Join(dir, "child.pid")
 	// Parent shell backgrounds a long-lived child (writing its PID to a file)
 	// and then sleeps. With job control off (non-interactive sh) the backgrounded
 	// child stays in the parent's process group, so the group kill must reach it.
 	script := fmt.Sprintf(`sleep 600 & echo $! > %q; sleep 600`, pidFile)
 	cmd := exec.Command("/bin/sh", "-c", script)
 	// Launch as its own process-group leader, exactly like a real step does (see
 	// getSysProcAttr), so the killer's PGID == the process PID.
 	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
 	require.NoError(t, cmd.Start())
 	// Safety net: whatever happens below, kill the group and reap the shell so
 	// a failed assertion does not leave the 600-second sleeps running.
 	t.Cleanup(func() {
 		_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
 		_ = cmd.Wait()
 	})
 	killer, err := newProcessKiller(cmd.Process)
 	require.NoError(t, err)
 	defer killer.Close()
 	// Wait for the backgrounded child PID to be reported.
 	var childPID int
 	require.Eventually(t, func() bool {
 		b, e := os.ReadFile(pidFile)
 		if e != nil {
 			return false
 		}
 		// The file may exist before the shell has written the PID into it.
 		s := strings.TrimSpace(string(b))
 		if s == "" {
 			return false
 		}
 		childPID, _ = strconv.Atoi(s)
 		return childPID > 0 && processAlive(childPID)
 	}, 20*time.Second, 100*time.Millisecond, "child process should start")
 	// Killing the group must terminate both the parent and the backgrounded child.
 	require.NoError(t, killer.Kill())
 	// Reap the parent so it does not linger as a zombie (which would still report
 	// as alive); SIGKILL makes Wait return promptly.
 	_ = cmd.Wait()
 	require.Eventually(t, func() bool {
 		return !processAlive(childPID)
 	}, 20*time.Second, 100*time.Millisecond, "backgrounded child should be terminated")
 }
--- a/act/runner/command.go
+++ b/act/runner/command.go
@@ -48,8 +48,11 @@ func (rc *RunContext) commandHandler(ctx context.Context) common.LineHandler {
 		if resumeCommand != "" && command != resumeCommand {
 			// There should not be any emojis in the log output for Gitea.
 			// The code in the switch statement is the same.
 			// Return true (not false) so the line still reaches the raw_output
 			// log handler; otherwise everything between ::stop-commands:: and
 			// its end token is silently dropped from the step log.
 			logger.Infof("%s", line)
-			return false
+			return true
 		}
 		arg = UnescapeCommandData(arg)
 		kvPairs = unescapeKvPairs(kvPairs)
--- a/act/runner/command_test.go
+++ b/act/runner/command_test.go
@@ -28,6 +28,29 @@ func TestSetEnv(t *testing.T) {
 	a.Equal("valz", rc.Env["x"])
 }
 func TestStopCommandsKeepsSuppressedLinesInLog(t *testing.T) {
 	a := assert.New(t)
 	ctx := context.Background()
 	rc := new(RunContext)
 	handler := rc.commandHandler(ctx)
 	// Stop command processing until the matching end token is seen.
 	a.True(handler("::stop-commands::my-end-token\n"))
 	// A command-shaped line while stopped must not be executed (env unchanged),
 	// but must still return true so it reaches the raw_output log handler and is
 	// not dropped from the step log.
 	a.True(handler("::set-env name=x::valz\n"))
 	a.NotContains(rc.Env, "x")
 	// The matching end token resumes command processing.
 	a.True(handler("::my-end-token::\n"))
 	// Commands are processed again after resuming.
 	a.True(handler("::set-env name=y::valy\n"))
 	a.Equal("valy", rc.Env["y"])
 }
 func TestSetOutput(t *testing.T) {
 	a := assert.New(t)
 	ctx := context.Background()
--- a/act/runner/job_executor.go
+++ b/act/runner/job_executor.go
@@ -462,6 +462,11 @@ func useStepLogger(rc *RunContext, stepModel *model.Step, stage stepStage, execu
 		oldout, olderr := rc.JobContainer.ReplaceLogWriter(logWriter, logWriter)
 		defer rc.JobContainer.ReplaceLogWriter(oldout, olderr)
 		// Flush any buffered, not-yet-newline-terminated trailing line once the
 		// step has finished, so the final line of the step's output is not lost
 		// when it is not newline-terminated.
 		defer common.FlushWriter(logWriter)
 		return executor(ctx)
 	}
 }
--- a/act/runner/run_context.go
+++ b/act/runner/run_context.go
@@ -471,7 +471,8 @@ func (rc *RunContext) startJobContainer() common.Executor {
 			rc.pullServicesImages(rc.Config.ForcePull),
 			rc.JobContainer.Pull(rc.Config.ForcePull),
 			rc.stopJobContainer(),
-			container.NewDockerNetworkCreateExecutor(networkName).IfBool(createAndDeleteNetwork),
+			container.NewDockerNetworkCreateExecutor(networkName, rc.Config.ContainerNetworkCreateOptions).
 				IfBool(createAndDeleteNetwork),
 			rc.startServiceContainers(networkName),
 			rc.JobContainer.Create(rc.Config.ContainerCapAdd, rc.Config.ContainerCapDrop),
 			rc.JobContainer.Start(false),
--- a/act/runner/runner.go
+++ b/act/runner/runner.go
@@ -15,6 +15,7 @@ import (
 	"time"
 	"gitea.com/gitea/runner/act/common"
 	"gitea.com/gitea/runner/act/container"
 	"gitea.com/gitea/runner/act/model"
 	docker_container "github.com/moby/moby/api/types/container"
@@ -68,6 +69,7 @@ type Config struct {
 	ReplaceGheActionTokenWithGithubCom string                                        // Token of private action repo on GitHub.
 	Matrix                             map[string]map[string]bool                    // Matrix config to run
 	ContainerNetworkMode               docker_container.NetworkMode                  // the network mode of job containers (the value of --network)
 	ContainerNetworkCreateOptions      container.NewDockerNetworkCreateExecutorInput // the default network create options
 	ActionCache                        ActionCache                                   // Use a custom ActionCache Implementation
 	PresetGitHubContext   *model.GithubContext         // the preset github context, overrides some fields like DefaultBranch, Env, Secrets etc.
--- a/go.mod
+++ b/go.mod
@@ -36,7 +36,7 @@ require (
 	github.com/stretchr/testify v1.11.1
 	github.com/timshannon/bolthold v0.0.0-20240314194003-30aac6950928
 	go.etcd.io/bbolt v1.4.3
-	go.yaml.in/yaml/v4 v4.0.0-rc.5
+	go.yaml.in/yaml/v4 v4.0.0-rc.3
 	golang.org/x/sys v0.46.0
 	golang.org/x/term v0.44.0
 	google.golang.org/protobuf v1.36.11
--- a/go.sum
+++ b/go.sum
@@ -232,8 +232,6 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
 go.yaml.in/yaml/v4 v4.0.0-rc.3 h1:3h1fjsh1CTAPjW7q/EMe+C8shx5d8ctzZTrLcs/j8Go=
 go.yaml.in/yaml/v4 v4.0.0-rc.3/go.mod h1:aZqd9kCMsGL7AuUv/m/PvWLdg5sjJsZ4oHDEnfPPfY0=
 go.yaml.in/yaml/v4 v4.0.0-rc.5 h1:JVliQq9EGOYaTgMi+k8BhUJyqcGk4ZqeuiN1Cirba9c=
 go.yaml.in/yaml/v4 v4.0.0-rc.5/go.mod h1:aZqd9kCMsGL7AuUv/m/PvWLdg5sjJsZ4oHDEnfPPfY0=
 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
 golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988=
 golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc=
--- a/internal/app/run/runner.go
+++ b/internal/app/run/runner.go
@@ -22,6 +22,7 @@ import (
 	"gitea.com/gitea/runner/act/artifactcache"
 	"gitea.com/gitea/runner/act/common"
 	"gitea.com/gitea/runner/act/container"
 	"gitea.com/gitea/runner/act/model"
 	"gitea.com/gitea/runner/act/runner"
 	"gitea.com/gitea/runner/internal/pkg/client"
@@ -33,7 +34,7 @@ import (
 	"connectrpc.com/connect"
 	runnerv1 "gitea.dev/actions-proto-go/runner/v1"
-	"github.com/moby/moby/api/types/container"
+	docker_container "github.com/moby/moby/api/types/container"
 	log "github.com/sirupsen/logrus"
 )
@@ -127,15 +128,22 @@ func (r *Runner) OnIdle(ctx context.Context) {
 	if !r.shouldRunIdleCleanup() {
 		return
 	}
 	// Bind-workdir mode: reclaim stale per-task workspace dirs (numeric task IDs).
 	if r.cfg.Container.BindWorkdir {
 		workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/")
 		workdirRoot := filepath.FromSlash("/" + workdirParent)
-	r.cleanupStaleTaskDirs(ctx, workdirRoot)
+		r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir)
 	}
 	// Host mode: reclaim per-job scratch dirs left behind when HostEnvironment
 	// cleanup timed out (e.g. a delete stalled by an AV/EDR filter driver). They
 	// sit under the host workdir parent alongside the shared tool_cache, which
 	// the name match leaves untouched. No-op when no host-mode job ever ran.
 	if hostRoot := filepath.FromSlash(r.cfg.Host.WorkdirParent); hostRoot != "" {
 		r.cleanupStaleDirs(ctx, hostRoot, isHostScratchDir)
 	}
 }
 func (r *Runner) shouldRunIdleCleanup() bool {
 	if !r.cfg.Container.BindWorkdir {
 		return false
 	}
 	if r.cfg.Runner.WorkdirCleanupAge <= 0 || r.cfg.Runner.IdleCleanupInterval <= 0 {
 		return false
 	}
@@ -155,18 +163,52 @@ func (r *Runner) shouldRunIdleCleanup() bool {
 	}
 }
 // cleanupStaleTaskDirs reclaims stale bind-workdir per-task directories under
 // workdirRoot. Retained as a thin wrapper so existing callers and tests keep a
 // stable entry point.
 func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) {
-	entries, err := os.ReadDir(workdirRoot)
+	r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir)
 }
 // isTaskIDDir reports whether name looks like a per-task workspace directory,
 // i.e. it parses as an unsigned base-10 64-bit task ID. Everything else is
 // rejected so operator-managed data under workdir_root is never a cleanup
 // candidate.
 func isTaskIDDir(name string) bool {
 	if _, parseErr := strconv.ParseUint(name, 10, 64); parseErr != nil {
 		return false
 	}
 	return true
 }
 // isHostScratchDir reports whether name has the shape of a per-job host-mode
 // scratch directory: the hex.EncodeToString form of 8 random bytes, which is
 // always 16 characters drawn from 0-9 and a-f (see startHostEnvironment in
 // act/runner/run_context.go). Keeping the match this narrow means the sibling
 // shared "tool_cache" directory and any operator data never qualify.
 func isHostScratchDir(name string) bool {
 	const scratchNameLen = 16
 	if len(name) != scratchNameLen {
 		return false
 	}
 	// Byte-wise scan: any byte outside the two ASCII ranges (including every
 	// byte of a multi-byte rune) disqualifies the name.
 	for i := 0; i < len(name); i++ {
 		b := name[i]
 		isDigit := b >= '0' && b <= '9'
 		isLowerHex := b >= 'a' && b <= 'f'
 		if !isDigit && !isLowerHex {
 			return false
 		}
 	}
 	return true
 }
 // cleanupStaleDirs removes immediate child directories of root that match and
 // whose mtime is older than WorkdirCleanupAge. It is a no-op when root does not
 // exist yet (the runner has never written there).
 func (r *Runner) cleanupStaleDirs(ctx context.Context, root string, match func(name string) bool) {
 	entries, err := os.ReadDir(root)
 	if err != nil {
 		if errors.Is(err, os.ErrNotExist) {
 			return
 		}
-		log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err)
+		log.Warnf("failed to list directory %s for stale cleanup: %v", root, err)
 		return
 	}
 	// A task may begin between shouldRunIdleCleanup's running-count check and
-	// the loop below. That is safe because new task dirs are created with the
+	// the loop below. That is safe because new dirs are created with the
 	// current mtime and therefore fall on the keep side of cutoff.
 	cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge)
 	for _, entry := range entries {
@@ -176,25 +218,23 @@ func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) {
 		if !entry.IsDir() {
 			continue
 		}
-		// Task workspaces are indexed by numeric task IDs; skip any other
+		if !match(entry.Name()) {
 		// directories to avoid deleting operator-managed data under workdir_root.
 		if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil {
 			continue
 		}
 		info, err := entry.Info()
 		if err != nil {
-			log.Warnf("failed to stat task workspace %s: %v", filepath.Join(workdirRoot, entry.Name()), err)
+			log.Warnf("failed to stat %s: %v", filepath.Join(root, entry.Name()), err)
 			continue
 		}
 		if info.ModTime().After(cutoff) {
 			continue
 		}
-		taskDir := filepath.Join(workdirRoot, entry.Name())
+		dir := filepath.Join(root, entry.Name())
-		if err := os.RemoveAll(taskDir); err != nil {
+		if err := os.RemoveAll(dir); err != nil {
-			log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err)
+			log.Warnf("failed to clean stale directory %s: %v", dir, err)
 			continue
 		}
-		log.Infof("cleaned stale task workspace %s", taskDir)
+		log.Infof("cleaned stale directory %s", dir)
 	}
 }
@@ -394,7 +434,11 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report.
 		ContainerNamePrefix:  fmt.Sprintf("GITEA-ACTIONS-TASK-%d", task.Id),
 		ContainerMaxLifetime: maxLifetime,
 		CleanWorkdir:         true,
-		ContainerNetworkMode:  container.NetworkMode(r.cfg.Container.Network),
+		ContainerNetworkMode: docker_container.NetworkMode(r.cfg.Container.Network),
 		ContainerNetworkCreateOptions: container.NewDockerNetworkCreateExecutorInput{
 			EnableIPv4: r.cfg.Container.NetworkCreateOptions.EnableIPv4,
 			EnableIPv6: r.cfg.Container.NetworkCreateOptions.EnableIPv6,
 		},
 		ContainerOptions:      r.cfg.Container.Options,
 		ContainerDaemonSocket: r.cfg.Container.DockerHost,
 		Privileged:            r.cfg.Container.Privileged,
--- a/internal/app/run/runner_idle_cleanup_test.go
+++ b/internal/app/run/runner_idle_cleanup_test.go
@@ -52,6 +52,55 @@ func TestRunnerCleanupStaleTaskDirs(t *testing.T) {
 	assert.DirExists(t, alphaNumericTask)
 }
 // TestRunnerOnIdleCleansStaleHostScratchDirs covers the host-mode leak path:
 // a per-job scratch dir (16 hex chars) left behind by a timed-out cleanup must
 // be reclaimed, while the shared tool_cache and operator data are preserved.
 func TestRunnerOnIdleCleansStaleHostScratchDirs(t *testing.T) {
 	fixedNow := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
 	hostRoot := filepath.Join(t.TempDir(), "act")
 	require.NoError(t, os.MkdirAll(hostRoot, 0o700))
 	// mkAged creates a child directory of hostRoot and backdates its atime and
 	// mtime by age relative to fixedNow, returning the full path.
 	mkAged := func(name string, age time.Duration) string {
 		t.Helper()
 		dir := filepath.Join(hostRoot, name)
 		require.NoError(t, os.MkdirAll(dir, 0o700))
 		stamp := fixedNow.Add(-age)
 		require.NoError(t, os.Chtimes(dir, stamp, stamp))
 		return dir
 	}
 	staleScratch := mkAged("0123456789abcdef", 48*time.Hour) // 16 hex, past the cleanup age
 	freshScratch := mkAged("fedcba9876543210", 10*time.Minute)
 	toolCache := mkAged("tool_cache", 72*time.Hour)
 	operatorData := mkAged("keep-me", 72*time.Hour)
 	runnerCfg := &config.Config{
 		Host: config.Host{WorkdirParent: hostRoot},
 		Runner: config.Runner{
 			WorkdirCleanupAge:   24 * time.Hour,
 			IdleCleanupInterval: time.Minute,
 		},
 	}
 	r := &Runner{
 		cfg: runnerCfg,
 		now: func() time.Time { return fixedNow },
 	}
 	r.OnIdle(context.Background())
 	assert.NoDirExists(t, staleScratch) // old scratch dir is reclaimed
 	assert.DirExists(t, freshScratch)   // younger than the cleanup age, so kept
 	assert.DirExists(t, toolCache)      // shared cache never matches the scratch pattern
 	assert.DirExists(t, operatorData)   // non-hex name is left alone
 }
 // TestIsHostScratchDir pins the exact name shape accepted as a host-mode
 // scratch directory: 16 characters, lowercase hex only.
 func TestIsHostScratchDir(t *testing.T) {
 	cases := []struct {
 		name string
 		want bool
 	}{
 		{"0123456789abcdef", true},
 		{"ffffffffffffffff", true},
 		{"tool_cache", false},
 		{"0123456789ABCDEF", false},  // hex.EncodeToString is lowercase
 		{"0123456789abcde", false},   // 15 chars
 		{"0123456789abcdef0", false}, // 17 chars
 		{"123", false},
 	}
 	for _, tc := range cases {
 		if tc.want {
 			assert.True(t, isHostScratchDir(tc.name))
 			continue
 		}
 		assert.False(t, isHostScratchDir(tc.name))
 	}
 }
 func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) {
 	r := &Runner{
 		cfg: &config.Config{
@@ -135,7 +184,10 @@ func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) {
 	assert.False(t, r.shouldRunIdleCleanup())
 }
-func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) {
+// Idle cleanup runs regardless of bind_workdir: host mode (bind_workdir off)
 // still leaves per-job scratch dirs that the sweep must reclaim.
 func TestRunnerShouldRunIdleCleanupRunsWithoutBindWorkdir(t *testing.T) {
 	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
 	r := &Runner{
 		cfg: &config.Config{
 			Runner: config.Runner{
@@ -143,10 +195,10 @@ func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) {
 				IdleCleanupInterval: time.Minute,
 			},
 		},
-		now: time.Now,
+		now: func() time.Time { return now },
 	}
-	assert.False(t, r.shouldRunIdleCleanup())
+	assert.True(t, r.shouldRunIdleCleanup())
 }
 func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) {
--- a/internal/pkg/config/config.example.yaml
+++ b/internal/pkg/config/config.example.yaml
@@ -40,11 +40,12 @@ runner:
  # The runner uses exponential backoff when idle, increasing the interval up to this maximum.
  # Set to 0 or same as fetch_interval to disable backoff.
  fetch_interval_max: 5s
-  # While idle, remove stale bind-workdir task directories older than this duration.
+  # While idle, remove stale bind-workdir task directories and orphaned host-mode
-  # Setting either workdir_cleanup_age or idle_cleanup_interval to 0 (or any
+  # scratch directories (left behind when a host cleanup delete stalls) older than
-  # non-positive value) disables workdir cleanup entirely.
+  # this duration. Setting either workdir_cleanup_age or idle_cleanup_interval to 0
  # (or any non-positive value) disables stale-directory cleanup entirely.
  workdir_cleanup_age: 24h
-  # Cadence for the idle stale bind-workdir cleanup pass.
+  # Cadence for the idle stale-directory cleanup pass.
  idle_cleanup_interval: 10m
  # The base interval for periodic log flush to the Gitea instance.
  # Logs may be sent earlier if the buffer reaches log_report_batch_size
@@ -115,6 +116,13 @@ container:
  # If it's empty, runner will create a network automatically.
  # Deprecated: `network_mode` is still accepted for old configs; use `network` instead.
  network: ""
  # network_create_options only apply when `network` is left empty and the runner
  # auto-creates a per-job network that does not already exist. They have no effect
  # when a custom `network` name is set, because that network is used as-is and never
  # created by the runner. Omit the entire block to use Docker's defaults.
  network_create_options:
    enable_ipv4: true  # Omit to use Docker's default (IPv4 enabled). Set false to disable IPv4.
    enable_ipv6: false # Omit to use Docker's default (IPv6 disabled). Enabling it requires dockerd started with --ipv6.
  # Whether to use privileged mode or not when launching task containers (privileged mode is required for Docker-in-Docker).
  privileged: false
  # Any other options to be used when the container is started (e.g., --add-host=my.gitea.url:host-gateway).
--- a/internal/pkg/config/config.go
+++ b/internal/pkg/config/config.go
@@ -33,8 +33,8 @@ type Runner struct {
 	FetchTimeout        time.Duration     `yaml:"fetch_timeout"`          // FetchTimeout specifies the timeout duration for fetching resources.
 	FetchInterval       time.Duration     `yaml:"fetch_interval"`         // FetchInterval specifies the interval duration for fetching resources.
 	FetchIntervalMax    time.Duration     `yaml:"fetch_interval_max"`     // FetchIntervalMax specifies the maximum backoff interval when idle.
-	WorkdirCleanupAge   time.Duration     `yaml:"workdir_cleanup_age"`    // WorkdirCleanupAge removes stale bind-workdir task directories older than this duration during idle cleanup.
+	WorkdirCleanupAge   time.Duration     `yaml:"workdir_cleanup_age"`    // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup.
-	IdleCleanupInterval time.Duration     `yaml:"idle_cleanup_interval"`  // IdleCleanupInterval runs stale bind-workdir cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
+	IdleCleanupInterval time.Duration     `yaml:"idle_cleanup_interval"`  // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
 	LogReportInterval   time.Duration     `yaml:"log_report_interval"`    // LogReportInterval specifies the base interval for periodic log flush.
 	LogReportMaxLatency time.Duration     `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
 	LogReportBatchSize  int               `yaml:"log_report_batch_size"`  // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
@@ -59,6 +59,7 @@ type Cache struct {
 // Container represents the configuration for the container.
 type Container struct {
 	Network              string                        `yaml:"network"`                // Network specifies the network for the container.
 	NetworkCreateOptions ContainerNetworkCreateOptions `yaml:"network_create_options"` // NetworkCreateOptions holds options applied when the network needs to be created by the runner.
 	NetworkMode          string                        `yaml:"network_mode"`           // Deprecated: use Network instead. Could be removed after Gitea 1.20
 	Privileged           bool                          `yaml:"privileged"`             // Privileged indicates whether the container runs in privileged mode.
 	Options              string                        `yaml:"options"`                // Options specifies additional options for the container.
@@ -72,6 +73,11 @@ type Container struct {
 	BindWorkdir          bool                          `yaml:"bind_workdir"`           // BindWorkdir binds the workspace to the host filesystem instead of using Docker volumes. Required for DinD when jobs use docker compose with bind mounts.
 }
 // ContainerNetworkCreateOptions holds the settings read from the
 // `network_create_options` YAML block. Both fields are pointers so that an
 // omitted key (nil) stays distinguishable from an explicit true or false.
 type ContainerNetworkCreateOptions struct {
 	EnableIPv4 *bool `yaml:"enable_ipv4"` // Enable or disable IPv4 for the network (true for docker by default)
 	EnableIPv6 *bool `yaml:"enable_ipv6"` // Enable or disable IPv6 for the network (false for docker by default)
 }
 // Host represents the configuration for the host.
 type Host struct {
 	WorkdirParent string `yaml:"workdir_parent"` // WorkdirParent specifies the parent directory for the host's working directory.
--- a/internal/pkg/config/config_test.go
+++ b/internal/pkg/config/config_test.go
@@ -117,3 +117,50 @@ func TestLoadDefault_MalformedYAMLReturnsParseError(t *testing.T) {
 	assert.Contains(t, err.Error(), "parse config file")
 	assert.NotContains(t, err.Error(), "defaults metadata")
 }
 // TestContainerNetworkCreateOptions checks how the enable_ipv4/enable_ipv6 YAML
 // keys decode into the *bool fields: an explicit true/false yields a non-nil
 // pointer with that value, while an omitted key (or an omitted block) stays nil
 // so that Docker's own default can apply.
 func TestContainerNetworkCreateOptions(t *testing.T) {
 	// parse writes doc to a temporary config file, loads it via LoadDefault and
 	// returns the decoded network_create_options section.
 	parse := func(t *testing.T, doc string) ContainerNetworkCreateOptions {
 		t.Helper()
 		cfgPath := filepath.Join(t.TempDir(), "config.yaml")
 		require.NoError(t, os.WriteFile(cfgPath, []byte(doc), 0o600))
 		loaded, err := LoadDefault(cfgPath)
 		require.NoError(t, err)
 		return loaded.Container.NetworkCreateOptions
 	}
 	// expectSet requires flag to be present and then compares its value.
 	expectSet := func(t *testing.T, flag *bool, want bool) {
 		t.Helper()
 		require.NotNil(t, flag)
 		if want {
 			assert.True(t, *flag)
 			return
 		}
 		assert.False(t, *flag)
 	}
 	t.Run("enable_ipv6 true unmarshals to non-nil true", func(t *testing.T) {
 		got := parse(t, "container:\n  network_create_options:\n    enable_ipv6: true\n")
 		expectSet(t, got.EnableIPv6, true)
 	})
 	t.Run("enable_ipv6 false unmarshals to non-nil false", func(t *testing.T) {
 		got := parse(t, "container:\n  network_create_options:\n    enable_ipv6: false\n")
 		expectSet(t, got.EnableIPv6, false)
 	})
 	t.Run("enable_ipv4 false unmarshals to non-nil false", func(t *testing.T) {
 		got := parse(t, "container:\n  network_create_options:\n    enable_ipv4: false\n")
 		expectSet(t, got.EnableIPv4, false)
 	})
 	t.Run("omitted keys stay nil", func(t *testing.T) {
 		got := parse(t, "container:\n  network_create_options:\n    enable_ipv4: true\n")
 		expectSet(t, got.EnableIPv4, true)
 		assert.Nil(t, got.EnableIPv6, "an omitted enable_ipv6 must remain nil so Docker's default applies")
 	})
 	t.Run("omitted block leaves both nil", func(t *testing.T) {
 		got := parse(t, "container:\n  network: \"\"\n")
 		assert.Nil(t, got.EnableIPv4)
 		assert.Nil(t, got.EnableIPv6)
 	})
 }
Author	SHA1	Message	Date
StarAurryon	2963716953	feat: ipv6 options for network container creation (#1029 ) Here is a final proposal for ipv6 enablement on temporary network created by gitea runner --------- Co-authored-by: Nicolas <bircni@icloud.com> Co-authored-by: Nicolas Schwartz <9308314+StarAurryon@users.noreply.github.com> Reviewed-on: https://gitea.com/gitea/runner/pulls/1029 Reviewed-by: Nicolas <bircni@icloud.com> Co-authored-by: StarAurryon <206206+staraurryon@noreply.gitea.com> Co-committed-by: StarAurryon <206206+staraurryon@noreply.gitea.com>	2026-06-15 05:05:20 +00:00
Nicolas	3996d6d032	fix(cleanup): kill Unix step process group on cancel to avoid hang (#1025 ) Cancelling a job on a Linux/macOS host runner can leave the spawned process tree running and hang the runner — the same failure mode fixed for Windows in #1011, just on the other platforms. Steps are launched as process-group leaders (`Setpgid`, or `Setsid` for the PTY path), but the default `exec.CommandContext` cancellation only kills the direct child. When a step launches a shell that starts a child which in turn spawns further background processes, cancelling the job leaves the descendants running. Because those orphans inherited the step's stdout/stderr pipe, the read end never hits EOF and `cmd.Wait()` blocks forever. Because the step executor never returns: - the orphaned processes keep running (the cancelled work is not actually stopped), and - end-of-job cleanup is never reached, so the runner appears to go offline / stop picking up jobs. ## Fix Apply the same tree-kill approach as Windows, using the Unix counterpart of a Job Object: the process group. - Add a Unix `processKiller` (`process_unix.go`) that captures the step's PGID (== PID, since the step is launched as a group leader) and sends `SIGKILL` to the whole group on cancellation. This also closes the inherited pipe handles so `cmd.Wait()` can return. `ESRCH` (group already gone) is not treated as an error. - Restrict the previous no-op stub (`process_other.go`) to `plan9` and have it fall back to a single-process kill, preserving plan9's prior behaviour. - Wire `cmd.Cancel` (tree kill) and `cmd.WaitDelay` (10s) unconditionally in `exec()` instead of Windows-only. `WaitDelay` also covers a step that backgrounds a process holding the pipe open after the main process exits. Reviewed-on: https://gitea.com/gitea/runner/pulls/1025 Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>	2026-06-14 20:52:42 +00:00
Nicolas	205af7cd01	fix: prevent loss of step log output at end of step (#1028 ) ## Problem Several runner code paths could drop the tail of a step's log output, so a failing (or cancelled) step would show output that is missing its last line(s). This was observed in practice and traced to four independent issues. ## Root causes & fixes ### 1. Trailing line without a newline was never flushed `common.lineWriter` buffers output until it sees a `\n`. A final line without a trailing newline (e.g. an error message printed right before a process exits, a panic, `printf` without `\n`) stayed in the internal buffer and was never emitted — the writer exposed no flush at all. - Added `lineWriter.Flush()` (idempotent), a `Flusher` interface, and a `FlushWriter(io.Writer)` helper. - Flush at every stream EOF: the exec copy goroutine, the container `attach()` streaming goroutine, and at step end (`useStepLogger`). ### 2. Cancellation/timeout truncated output `waitForCommand` returned immediately on `ctx.Done()` and abandoned the output-copy goroutine, losing output the command had already produced. It now drains with a bounded grace period before returning. The response channel is buffered so the goroutine can't leak if the drain times out. ### 3. `attach()` raced the final bytes Container output was streamed in a fire-and-forget goroutine that `wait()` did not synchronize with, so the step could proceed before the last bytes were written. `wait()` now blocks on the streaming goroutine (bounded) so output is fully drained and flushed first. ### 4. `::stop-commands::` silently dropped lines from the step log Lines between `::stop-commands::<token>` and its end token were echoed without the `raw_output` field and short-circuited the handler chain (`return false`), so they never reached the step log (non-raw entries aren't appended while a step is running). Now returns `true` so they are still captured. 
Reviewed-on: https://gitea.com/gitea/runner/pulls/1028 Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>	2026-06-14 20:43:19 +00:00
Nicolas	33e6d1d8ff	fix(host): bound host-environment cleanup and reclaim leaked scratch dirs (#1024 ) Fixes #1023. ## Problem In Windows host mode, a single stalled delete syscall (AV/EDR filter driver, unresponsive mount, dying disk) wedged the job forever at `Cleaning up container`. `HostEnvironment.Remove()` bounds every teardown phase (`terminateRunningProcesses`, both `removePathWithRetry` calls) except the `CleanUp` callback — an unbounded `os.RemoveAll(miscpath)` assigned in `startHostEnvironment`. The runner then held its capacity slot indefinitely, the task was reaped as a zombie, and there were no diagnostics. ## Fix - Bound the cleanup (availability): `Remove()` now runs `CleanUp` under `hostCleanupTimeout` (30s) via `runWithTimeout`; on timeout it logs a warning and continues job completion. The stuck goroutine is left to finish (a delete syscall can't be interrupted). Added debug logs around the phase. - Reclaim the leak (disk hygiene): a timed-out cleanup can leave a scratch dir behind, so the existing idle stale-dir sweep is extended to also remove orphaned host-mode scratch dirs (16-hex names) under `Host.WorkdirParent`, leaving the shared `tool_cache` and operator data untouched. The `bind_workdir` gate is dropped from `shouldRunIdleCleanup` so host-mode runners run the sweep. Reviewed-on: https://gitea.com/gitea/runner/pulls/1024 Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com>	2026-06-14 14:14:43 +00:00