diff --git a/.gitea/workflows/release-tag.yml b/.gitea/workflows/release-tag.yml index 4a2bb3bc..2005e711 100644 --- a/.gitea/workflows/release-tag.yml +++ b/.gitea/workflows/release-tag.yml @@ -17,7 +17,7 @@ jobs: go-version-file: "go.mod" - name: Import GPG key id: import_gpg - uses: crazy-max/ghaction-import-gpg@v6 + uses: crazy-max/ghaction-import-gpg@v7 with: gpg_private_key: ${{ secrets.GPG_PRIVATE_KEY }} passphrase: ${{ secrets.PASSPHRASE }} diff --git a/README.md b/README.md index f159e44b..975bc28e 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,12 @@ Besides `GITEA_INSTANCE_URL` and `GITEA_RUNNER_REGISTRATION_TOKEN`, the image en For a fuller container-oriented walkthrough, see [examples/docker](examples/docker/README.md). +When `container.bind_workdir` is enabled, stale task workspace directories can be cleaned while the runner is idle: +- directories older than `runner.workdir_cleanup_age` are removed (default: `24h`; set `0` to disable) +- cleanup runs every `runner.idle_cleanup_interval` (default: `10m`; set `0` to disable) +- only purely numeric subdirectories under `container.workdir_parent` are treated as task workspaces and may be removed +- cleanup assumes `container.workdir_parent` is not shared across multiple runners + ### Example Deployments Check out the [examples](examples) directory for sample deployment types. diff --git a/act/container/container_types.go b/act/container/container_types.go index a1b2f01c..16b9ed99 100644 --- a/act/container/container_types.go +++ b/act/container/container_types.go @@ -6,6 +6,7 @@ package container import ( "context" + "fmt" "io" "gitea.com/gitea/runner/act/common" @@ -13,6 +14,13 @@ import ( "github.com/docker/go-connections/nat" ) +// ExitCodeError reports a non-zero process exit code from a container command. 
+type ExitCodeError int + +func (e ExitCodeError) Error() string { + return fmt.Sprintf("Process completed with exit code %d.", int(e)) +} + // NewContainerInput the input for the New function type NewContainerInput struct { Image string diff --git a/act/container/docker_run.go b/act/container/docker_run.go index bb9d8fb6..3aa891dc 100644 --- a/act/container/docker_run.go +++ b/act/container/docker_run.go @@ -633,14 +633,10 @@ func (cr *containerReference) exec(cmd []string, env map[string]string, user, wo return fmt.Errorf("failed to inspect exec: %w", err) } - switch inspectResp.ExitCode { - case 0: + if inspectResp.ExitCode == 0 { return nil - case 127: - return fmt.Errorf("exitcode '%d': command not found, please refer to https://github.com/nektos/act/issues/107 for more information", inspectResp.ExitCode) - default: - return fmt.Errorf("exitcode '%d': failure", inspectResp.ExitCode) } + return ExitCodeError(inspectResp.ExitCode) } } @@ -930,7 +926,7 @@ func (cr *containerReference) wait() common.Executor { return nil } - return fmt.Errorf("exit with `FAILURE`: %v", statusCode) + return ExitCodeError(statusCode) } } diff --git a/act/container/docker_run_test.go b/act/container/docker_run_test.go index beae70ca..9194d8c4 100644 --- a/act/container/docker_run_test.go +++ b/act/container/docker_run_test.go @@ -23,6 +23,7 @@ import ( "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" ) func TestDocker(t *testing.T) { @@ -85,6 +86,11 @@ func (m *mockDockerClient) ContainerExecInspect(ctx context.Context, execID stri return args.Get(0).(types.ContainerExecInspect), args.Error(1) } +func (m *mockDockerClient) ContainerWait(ctx context.Context, containerID string, condition container.WaitCondition) (<-chan container.WaitResponse, <-chan error) { + args := m.Called(ctx, containerID, condition) + return args.Get(0).(<-chan container.WaitResponse), args.Get(1).(<-chan 
error) +} + func (m *mockDockerClient) CopyToContainer(ctx context.Context, id, path string, content io.Reader, options types.CopyToContainerOptions) error { args := m.Called(ctx, id, path, content, options) return args.Error(0) @@ -174,12 +180,43 @@ func TestDockerExecFailure(t *testing.T) { } err := cr.exec([]string{""}, map[string]string{}, "user", "workdir")(ctx) - assert.Error(t, err, "exit with `FAILURE`: 1") //nolint:testifylint // pre-existing issue from nektos/act + var exitErr ExitCodeError + require.ErrorAs(t, err, &exitErr) + assert.Equal(t, ExitCodeError(1), exitErr) + assert.Equal(t, "Process completed with exit code 1.", err.Error()) conn.AssertExpectations(t) client.AssertExpectations(t) } +func TestDockerWaitFailure(t *testing.T) { + ctx := context.Background() + + statusCh := make(chan container.WaitResponse, 1) + statusCh <- container.WaitResponse{StatusCode: 2} + errCh := make(chan error, 1) + + client := &mockDockerClient{} + client.On("ContainerWait", ctx, "123", container.WaitConditionNotRunning). 
+ Return((<-chan container.WaitResponse)(statusCh), (<-chan error)(errCh)) + + cr := &containerReference{ + id: "123", + cli: client, + input: &NewContainerInput{ + Image: "image", + }, + } + + err := cr.wait()(ctx) + var exitErr ExitCodeError + require.ErrorAs(t, err, &exitErr) + assert.Equal(t, ExitCodeError(2), exitErr) + assert.Equal(t, "Process completed with exit code 2.", err.Error()) + + client.AssertExpectations(t) +} + func TestDockerCopyTarStream(t *testing.T) { ctx := context.Background() diff --git a/act/container/host_environment.go b/act/container/host_environment.go index 538bafeb..121266b8 100644 --- a/act/container/host_environment.go +++ b/act/container/host_environment.go @@ -16,7 +16,9 @@ import ( "os/exec" "path/filepath" "runtime" + "strconv" "strings" + "sync" "time" "gitea.com/gitea/runner/act/common" @@ -34,9 +36,15 @@ type HostEnvironment struct { TmpDir string ToolCache string Workdir string - ActPath string - CleanUp func() - StdOut io.Writer + // BindWorkdir is true when the app runner mounts the workspace on the host and + // deletes the task directory after the job; host teardown must not remove Workdir. + BindWorkdir bool + ActPath string + CleanUp func() + StdOut io.Writer + + mu sync.Mutex + runningPIDs map[int]struct{} } func (e *HostEnvironment) Create(_, _ []string) common.Executor { @@ -344,8 +352,30 @@ func (e *HostEnvironment) exec(ctx context.Context, command []string, cmdline st if ppty != nil { go writeKeepAlive(ppty) } - err = cmd.Run() + // Split Start/Wait so the PID can be registered before the process can exit; + // cmd.Run() would block until exit, by which time the PID may have been reused. 
+ if err := cmd.Start(); err != nil { + return err + } + if cmd.Process != nil { + e.mu.Lock() + if e.runningPIDs == nil { + e.runningPIDs = map[int]struct{}{} + } + e.runningPIDs[cmd.Process.Pid] = struct{}{} + e.mu.Unlock() + defer func(pid int) { + e.mu.Lock() + delete(e.runningPIDs, pid) + e.mu.Unlock() + }(cmd.Process.Pid) + } + err = cmd.Wait() if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return ExitCodeError(exitErr.ExitCode()) + } return err } if tty != nil { @@ -385,12 +415,83 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string) return parseEnvFile(e, srcPath, env) } +func removePathWithRetry(ctx context.Context, path string) error { + if path == "" { + return nil + } + attempts := 1 + delay := time.Duration(0) + if runtime.GOOS == "windows" { + attempts = 5 + delay = 200 * time.Millisecond + } + var lastErr error + for i := 0; i < attempts; i++ { + if i > 0 { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(delay): + } + } + lastErr = os.RemoveAll(path) + if lastErr == nil { + return nil + } + } + return lastErr +} + +func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) { + if runtime.GOOS != "windows" { + return + } + e.mu.Lock() + pids := make([]int, 0, len(e.runningPIDs)) + for pid := range e.runningPIDs { + pids = append(pids, pid) + } + e.mu.Unlock() + + if len(pids) == 0 { + return + } + + logger := common.Logger(ctx) + for _, pid := range pids { + // Best-effort: forcibly terminate process tree to release file handles + // so that workspace cleanup can succeed on Windows. 
+ cmd := exec.CommandContext(ctx, "taskkill", "/PID", strconv.Itoa(pid), "/T", "/F") + out, err := cmd.CombinedOutput() + if err != nil { + logger.Debugf("taskkill failed for pid=%d: %v output=%s", pid, err, strings.TrimSpace(string(out))) + } + } +} + func (e *HostEnvironment) Remove() common.Executor { return func(ctx context.Context) error { + // Ensure any lingering child processes are ended before attempting + // to remove the workspace (Windows file locks otherwise prevent cleanup). + e.terminateRunningProcesses(ctx) + + // Only removes per-job misc state. Must not remove the cache/toolcache root. if e.CleanUp != nil { e.CleanUp() } - return os.RemoveAll(e.Path) + logger := common.Logger(ctx) + var errs []error + if err := removePathWithRetry(ctx, e.Path); err != nil { + logger.Warnf("failed to remove host misc state %s: %v", e.Path, err) + errs = append(errs, err) + } + if !e.BindWorkdir && e.Workdir != "" { + if err := removePathWithRetry(ctx, e.Workdir); err != nil { + logger.Warnf("failed to remove host workspace %s: %v", e.Workdir, err) + errs = append(errs, err) + } + } + return errors.Join(errs...) 
} } diff --git a/act/container/host_environment_test.go b/act/container/host_environment_test.go index 040c0642..4101e8ba 100644 --- a/act/container/host_environment_test.go +++ b/act/container/host_environment_test.go @@ -11,9 +11,14 @@ import ( "os" "path" "path/filepath" + "runtime" "testing" + "gitea.com/gitea/runner/act/common" + + "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) // Type assert HostEnvironment implements ExecutionsEnvironment @@ -69,3 +74,76 @@ func TestGetContainerArchive(t *testing.T) { _, err = reader.Next() assert.ErrorIs(t, err, io.EOF) } + +func TestHostEnvironmentExecExitCode(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("uses POSIX shell") + } + dir := t.TempDir() + ctx := context.Background() + e := &HostEnvironment{ + Path: filepath.Join(dir, "path"), + TmpDir: filepath.Join(dir, "tmp"), + ToolCache: filepath.Join(dir, "tool_cache"), + ActPath: filepath.Join(dir, "act_path"), + StdOut: io.Discard, + Workdir: filepath.Join(dir, "path"), + } + for _, p := range []string{e.Path, e.TmpDir, e.ToolCache, e.ActPath} { + assert.NoError(t, os.MkdirAll(p, 0o700)) //nolint:testifylint // test setup + } + + err := e.Exec([]string{"sh", "-c", "exit 3"}, map[string]string{"PATH": os.Getenv("PATH")}, "", "")(ctx) + var exitErr ExitCodeError + require.ErrorAs(t, err, &exitErr) + assert.Equal(t, ExitCodeError(3), exitErr) + assert.Equal(t, "Process completed with exit code 3.", err.Error()) +} + +func TestHostEnvironmentRemoveCleansWorkdir(t *testing.T) { + logger := logrus.New() + ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger)) + base := t.TempDir() + miscRoot := filepath.Join(base, "misc") + path := filepath.Join(miscRoot, "hostexecutor") + require.NoError(t, os.MkdirAll(path, 0o700)) + workdir := filepath.Join(base, "workspace", "owner", "repo") + require.NoError(t, os.MkdirAll(workdir, 0o700)) + + e := &HostEnvironment{ + Path: path, + Workdir: workdir, 
+ BindWorkdir: false, + CleanUp: func() { + _ = os.RemoveAll(miscRoot) + }, + StdOut: os.Stdout, + } + require.NoError(t, e.Remove()(ctx)) + _, err := os.Stat(workdir) + assert.ErrorIs(t, err, os.ErrNotExist) +} + +func TestHostEnvironmentRemoveSkipsWorkdirWhenBindWorkdir(t *testing.T) { + logger := logrus.New() + ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger)) + base := t.TempDir() + miscRoot := filepath.Join(base, "misc") + path := filepath.Join(miscRoot, "hostexecutor") + require.NoError(t, os.MkdirAll(path, 0o700)) + workdir := filepath.Join(base, "workspace", "123", "owner", "repo") + require.NoError(t, os.MkdirAll(workdir, 0o700)) + + e := &HostEnvironment{ + Path: path, + Workdir: workdir, + BindWorkdir: true, + CleanUp: func() { + _ = os.RemoveAll(miscRoot) + }, + StdOut: os.Stdout, + } + require.NoError(t, e.Remove()(ctx)) + _, err := os.Stat(workdir) + require.NoError(t, err) +} diff --git a/act/runner/job_executor.go b/act/runner/job_executor.go index 2001dea4..ed9f7062 100644 --- a/act/runner/job_executor.go +++ b/act/runner/job_executor.go @@ -24,6 +24,13 @@ type jobInfo interface { result(result string) } +// reportStepError emits the GitHub Actions ##[error] annotation and records +// the error against the job so the job is reported as failed. 
+func reportStepError(ctx context.Context, err error) { + common.Logger(ctx).Errorf("##[error]%v", err) + common.SetJobError(ctx, err) +} + func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executor { steps := make([]common.Executor, 0) preSteps := make([]common.Executor, 0) @@ -32,7 +39,7 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo steps = append(steps, func(ctx context.Context) error { logger := common.Logger(ctx) if len(info.matrix()) > 0 { - logger.Infof("\U0001F9EA Matrix: %v", info.matrix()) + logger.Infof("Matrix: %v", info.matrix()) } return nil }) @@ -75,33 +82,36 @@ func newJobExecutor(info jobInfo, sf stepFactory, rc *RunContext) common.Executo preExec := step.pre() preSteps = append(preSteps, useStepLogger(rc, stepModel, stepStagePre, func(ctx context.Context) error { - logger := common.Logger(ctx) preErr := preExec(ctx) if preErr != nil { - logger.Errorf("%v", preErr) - common.SetJobError(ctx, preErr) + reportStepError(ctx, preErr) } else if ctx.Err() != nil { - logger.Errorf("%v", ctx.Err()) - common.SetJobError(ctx, ctx.Err()) + reportStepError(ctx, ctx.Err()) } return preErr })) stepExec := step.main() steps = append(steps, useStepLogger(rc, stepModel, stepStageMain, func(ctx context.Context) error { - logger := common.Logger(ctx) err := stepExec(ctx) if err != nil { - logger.Errorf("%v", err) - common.SetJobError(ctx, err) + reportStepError(ctx, err) } else if ctx.Err() != nil { - logger.Errorf("%v", ctx.Err()) - common.SetJobError(ctx, ctx.Err()) + reportStepError(ctx, ctx.Err()) } return nil })) - postExec := useStepLogger(rc, stepModel, stepStagePost, step.post()) + postFn := step.post() + postExec := useStepLogger(rc, stepModel, stepStagePost, func(ctx context.Context) error { + err := postFn(ctx) + if err != nil { + reportStepError(ctx, err) + } else if ctx.Err() != nil { + reportStepError(ctx, ctx.Err()) + } + return err + }) if postExecutor != nil { // run the post executor in 
reverse order postExecutor = postExec.Finally(postExecutor) @@ -196,7 +206,7 @@ func setJobResult(ctx context.Context, info jobInfo, rc *RunContext, success boo jobResultMessage = "failed" } - logger.WithField("jobResult", jobResult).Infof("\U0001F3C1 Job %s", jobResultMessage) + logger.WithField("jobResult", jobResult).Infof("Job %s", jobResultMessage) } func setJobOutputs(ctx context.Context, rc *RunContext) { diff --git a/act/runner/run_context.go b/act/runner/run_context.go index 1fee3234..9a82ebe3 100644 --- a/act/runner/run_context.go +++ b/act/runner/run_context.go @@ -220,11 +220,12 @@ func (rc *RunContext) startHostEnvironment() common.Executor { } toolCache := filepath.Join(cacheDir, "tool_cache") rc.JobContainer = &container.HostEnvironment{ - Path: path, - TmpDir: runnerTmp, - ToolCache: toolCache, - Workdir: rc.Config.Workdir, - ActPath: actPath, + Path: path, + TmpDir: runnerTmp, + ToolCache: toolCache, + Workdir: rc.Config.Workdir, + BindWorkdir: rc.Config.BindWorkdir, + ActPath: actPath, CleanUp: func() { os.RemoveAll(miscpath) }, @@ -729,7 +730,7 @@ func (rc *RunContext) isEnabled(ctx context.Context) (bool, error) { jobType, jobTypeErr := job.Type() if runJobErr != nil { - return false, fmt.Errorf(" \u274C Error in if-expression: \"if: %s\" (%s)", job.If.Value, runJobErr) + return false, fmt.Errorf("if-expression %q evaluation failed: %s", job.If.Value, runJobErr) } if jobType == model.JobTypeInvalid { diff --git a/act/runner/step.go b/act/runner/step.go index d4ee50ac..8a241acf 100644 --- a/act/runner/step.go +++ b/act/runner/step.go @@ -107,7 +107,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo if strings.Contains(stepString, "::add-mask::") { stepString = "add-mask command" } - logger.Infof("\u2B50 Run %s %s", stage, stepString) + logger.Infof("Run %s %s", stage, stepString) // Prepare and clean Runner File Commands actPath := rc.JobContainer.GetActPath() @@ -158,7 +158,7 @@ func runStepExecutor(step step, 
stage stepStage, executor common.Executor) commo err = executor(timeoutctx) if err == nil { - logger.WithField("stepResult", stepResult.Outcome).Infof(" \u2705 Success - %s %s", stage, stepString) + logger.WithField("stepResult", stepResult.Outcome).Infof("Success - %s %s", stage, stepString) } else { stepResult.Outcome = model.StepStatusFailure @@ -169,6 +169,7 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo } if continueOnError { + logger.Errorf("##[error]%v", err) logger.Infof("Failed but continue next step") err = nil stepResult.Conclusion = model.StepStatusSuccess @@ -176,7 +177,9 @@ func runStepExecutor(step step, stage stepStage, executor common.Executor) commo stepResult.Conclusion = model.StepStatusFailure } - logger.WithField("stepResult", stepResult.Outcome).Errorf(" \u274C Failure - %s %s", stage, stepString) + // Infof: Errorf entries are promoted to the user log by the reporter, + // which would duplicate the ##[error] annotation emitted elsewhere. 
+ logger.WithField("stepResult", stepResult.Outcome).Infof("Failure - %s %s", stage, stepString) } // Process Runner File Commands orgerr := err @@ -268,7 +271,7 @@ func isStepEnabled(ctx context.Context, expr string, step step, stage stepStage) runStep, err := EvalBool(ctx, rc.NewStepExpressionEvaluator(ctx, step), expr, defaultStatusCheck) if err != nil { - return false, fmt.Errorf(" \u274C Error in if-expression: \"if: %s\" (%s)", expr, err) + return false, fmt.Errorf("if-expression %q evaluation failed: %s", expr, err) } return runStep, nil @@ -284,7 +287,7 @@ func isContinueOnError(ctx context.Context, expr string, step step, _ stepStage) continueOnError, err := EvalBool(ctx, rc.NewStepExpressionEvaluator(ctx, step), expr, exprparser.DefaultStatusCheckNone) if err != nil { - return false, fmt.Errorf(" \u274C Error in continue-on-error-expression: \"continue-on-error: %s\" (%s)", expr, err) + return false, fmt.Errorf("continue-on-error expression %q evaluation failed: %s", expr, err) } return continueOnError, nil diff --git a/act/runner/testdata/local-action-via-composite-dockerfile/action.yml b/act/runner/testdata/local-action-via-composite-dockerfile/action.yml index c9a0527e..0770815a 100644 --- a/act/runner/testdata/local-action-via-composite-dockerfile/action.yml +++ b/act/runner/testdata/local-action-via-composite-dockerfile/action.yml @@ -32,7 +32,7 @@ runs: shell: bash - uses: ./localdockerimagetest_ # Also test a remote docker action here - - uses: actions/hello-world-docker-action@v1 + - uses: actions/hello-world-docker-action@v2 with: who-to-greet: 'Mona the Octocat' # Test if GITHUB_ACTION_PATH is set correctly after all steps diff --git a/go.mod b/go.mod index e1cf695a..37d49da0 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 go.yaml.in/yaml/v4 v4.0.0-rc.3 - golang.org/x/term v0.41.0 + golang.org/x/term v0.42.0 golang.org/x/time v0.14.0 // indirect 
google.golang.org/protobuf v1.36.11 gopkg.in/yaml.v3 v3.0.1 // indirect @@ -108,7 +108,7 @@ require ( golang.org/x/crypto v0.49.0 // indirect golang.org/x/net v0.52.0 // indirect golang.org/x/sync v0.20.0 // indirect - golang.org/x/sys v0.42.0 // indirect + golang.org/x/sys v0.43.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/grpc v1.67.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect diff --git a/go.sum b/go.sum index f9d3bb46..d6050005 100644 --- a/go.sum +++ b/go.sum @@ -311,11 +311,15 @@ golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= +golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= +golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= +golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= diff --git a/internal/app/poll/poller.go b/internal/app/poll/poller.go index 37ab7b28..9e3ea61a 100644 --- a/internal/app/poll/poller.go +++ 
b/internal/app/poll/poller.go @@ -27,6 +27,11 @@ type TaskRunner interface { Run(ctx context.Context, task *runnerv1.Task) error } +// IdleRunner can run maintenance while the poller is idle. +type IdleRunner interface { + OnIdle(ctx context.Context) +} + type Poller struct { client client.Client runner TaskRunner @@ -95,6 +100,7 @@ func (p *Poller) Poll() { task, ok := p.fetchTask(p.pollingCtx, s) if !ok { + p.runIdleMaintenance() <-sem if !p.waitBackoff(s) { return @@ -119,6 +125,7 @@ func (p *Poller) PollOnce() { for { task, ok := p.fetchTask(p.pollingCtx, s) if !ok { + p.runIdleMaintenance() if !p.waitBackoff(s) { return } @@ -130,6 +137,12 @@ func (p *Poller) PollOnce() { } } +func (p *Poller) runIdleMaintenance() { + if idleRunner, ok := p.runner.(IdleRunner); ok { + idleRunner.OnIdle(p.jobsCtx) + } +} + func (p *Poller) Shutdown(ctx context.Context) error { p.shutdownPolling() diff --git a/internal/app/poll/poller_test.go b/internal/app/poll/poller_test.go index e9d10d12..61e43c05 100644 --- a/internal/app/poll/poller_test.go +++ b/internal/app/poll/poller_test.go @@ -125,6 +125,11 @@ type mockRunner struct { totalCompleted atomic.Int64 } +type idleAwareRunner struct { + mockRunner + idleCalls atomic.Int64 +} + func (m *mockRunner) Run(ctx context.Context, _ *runnerv1.Task) error { atomicMax(&m.maxConcurrent, m.running.Add(1)) select { @@ -136,6 +141,78 @@ func (m *mockRunner) Run(ctx context.Context, _ *runnerv1.Task) error { return nil } +func TestPollerRunIdleMaintenance(t *testing.T) { + runner := &idleAwareRunner{} + p := &Poller{runner: runner, jobsCtx: context.Background()} + + p.runIdleMaintenance() + + assert.Equal(t, int64(1), runner.idleCalls.Load()) +} + +func (m *idleAwareRunner) OnIdle(_ context.Context) { + m.idleCalls.Add(1) +} + +func TestPollerPollCallsOnIdle(t *testing.T) { + cli := mocks.NewClient(t) + cli.On("FetchTask", mock.Anything, mock.Anything).Return( + func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) 
(*connect_go.Response[runnerv1.FetchTaskResponse], error) { + return connect_go.NewResponse(&runnerv1.FetchTaskResponse{}), nil + }, + ) + + cfg, err := config.LoadDefault("") + require.NoError(t, err) + cfg.Runner.Capacity = 1 + cfg.Runner.FetchInterval = 10 * time.Millisecond + cfg.Runner.FetchIntervalMax = 10 * time.Millisecond + + runner := &idleAwareRunner{} + poller := New(cfg, cli, runner) + + var wg sync.WaitGroup + wg.Go(poller.Poll) + + require.Eventually(t, func() bool { + return runner.idleCalls.Load() > 0 + }, time.Second, 10*time.Millisecond) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + require.NoError(t, poller.Shutdown(ctx)) + wg.Wait() +} + +func TestPollerPollOnceCallsOnIdle(t *testing.T) { + cli := mocks.NewClient(t) + cli.On("FetchTask", mock.Anything, mock.Anything).Return( + func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) { + return connect_go.NewResponse(&runnerv1.FetchTaskResponse{}), nil + }, + ) + + cfg, err := config.LoadDefault("") + require.NoError(t, err) + cfg.Runner.FetchInterval = 10 * time.Millisecond + cfg.Runner.FetchIntervalMax = 10 * time.Millisecond + + runner := &idleAwareRunner{} + poller := New(cfg, cli, runner) + + var wg sync.WaitGroup + wg.Go(poller.PollOnce) + + require.Eventually(t, func() bool { + return runner.idleCalls.Load() > 0 + }, time.Second, 10*time.Millisecond) + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + require.NoError(t, poller.Shutdown(ctx)) + wg.Wait() +} + // TestPoller_ConcurrencyLimitedByCapacity verifies that with capacity=3 and // 6 available tasks, at most 3 tasks run concurrently, and FetchTask is // never called concurrently (single poller). 
diff --git a/internal/app/run/runner.go b/internal/app/run/runner.go index f03e5e25..bb67ef9d 100644 --- a/internal/app/run/runner.go +++ b/internal/app/run/runner.go @@ -7,11 +7,14 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "maps" "net/http" "os" "path/filepath" + "runtime" + "strconv" "strings" "sync" "sync/atomic" @@ -45,8 +48,10 @@ type Runner struct { envs map[string]string cacheHandler *artifactcache.Handler - runningTasks sync.Map - runningCount atomic.Int64 + runningTasks sync.Map + runningCount atomic.Int64 + lastIdleCleanupUnixNano atomic.Int64 + now func() time.Time } func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client) *Runner { @@ -89,13 +94,94 @@ func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client) envs["GITEA_ACTIONS"] = "true" envs["GITEA_ACTIONS_RUNNER_VERSION"] = ver.Version() - return &Runner{ + runner := &Runner{ name: reg.Name, cfg: cfg, client: cli, labels: ls, envs: envs, cacheHandler: cacheHandler, + now: time.Now, + } + return runner +} + +// OnIdle performs lightweight maintenance during polling idle windows. +// It runs synchronously on the poller goroutine; shouldRunIdleCleanup +// throttles invocations to runner.idle_cleanup_interval so the impact on +// poll cadence is bounded even when the workdir root is large. 
+func (r *Runner) OnIdle(ctx context.Context) { + if !r.shouldRunIdleCleanup() { + return + } + workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/") + workdirRoot := filepath.FromSlash("/" + workdirParent) + r.cleanupStaleTaskDirs(ctx, workdirRoot) +} + +func (r *Runner) shouldRunIdleCleanup() bool { + if !r.cfg.Container.BindWorkdir { + return false + } + if r.cfg.Runner.WorkdirCleanupAge <= 0 || r.cfg.Runner.IdleCleanupInterval <= 0 { + return false + } + if r.RunningCount() != 0 { + return false + } + now := r.now() + interval := r.cfg.Runner.IdleCleanupInterval + for { + last := r.lastIdleCleanupUnixNano.Load() + if last != 0 && now.Sub(time.Unix(0, last)) < interval { + return false + } + if r.lastIdleCleanupUnixNano.CompareAndSwap(last, now.UnixNano()) { + return true + } + } +} + +func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) { + entries, err := os.ReadDir(workdirRoot) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return + } + log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err) + return + } + + // A task may begin between shouldRunIdleCleanup's running-count check and + // the loop below. That is safe because new task dirs are created with the + // current mtime and therefore fall on the keep side of cutoff. + cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge) + for _, entry := range entries { + if err := ctx.Err(); err != nil { + return + } + if !entry.IsDir() { + continue + } + // Task workspaces are indexed by numeric task IDs; skip any other + // directories to avoid deleting operator-managed data under workdir_root. 
+ if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil { + continue + } + info, err := entry.Info() + if err != nil { + log.Warnf("failed to stat task workspace %s: %v", filepath.Join(workdirRoot, entry.Name()), err) + continue + } + if info.ModTime().After(cutoff) { + continue + } + taskDir := filepath.Join(workdirRoot, entry.Name()) + if err := os.RemoveAll(taskDir); err != nil { + log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err) + continue + } + log.Infof("cleaned stale task workspace %s", taskDir) } } @@ -238,6 +324,13 @@ func (r *Runner) run(ctx context.Context, task *runnerv1.Task, reporter *report. workdirParent = fmt.Sprintf("%s/%d", workdirParent, task.Id) } workdir := filepath.FromSlash(fmt.Sprintf("/%s/%s", workdirParent, preset.Repository)) + if runtime.GOOS == "windows" { + if abs, err := filepath.Abs(workdir); err == nil { + workdir = abs + } + } + // Without bind_workdir, the workspace path omits the task id; concurrent host-mode jobs + // for the same repository would share this directory and can race with per-job cleanup. runnerConfig := &runner.Config{ // On Linux, Workdir will be like "///" diff --git a/internal/app/run/runner_idle_cleanup_test.go b/internal/app/run/runner_idle_cleanup_test.go new file mode 100644 index 00000000..8f1dce47 --- /dev/null +++ b/internal/app/run/runner_idle_cleanup_test.go @@ -0,0 +1,247 @@ +// Copyright 2026 The Gitea Authors. All rights reserved. 
+// SPDX-License-Identifier: MIT + +package run + +import ( + "context" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + "gitea.com/gitea/runner/internal/pkg/config" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRunnerCleanupStaleTaskDirs(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) + workdirRoot := filepath.Join(t.TempDir(), "workspace") + require.NoError(t, os.MkdirAll(workdirRoot, 0o700)) + + oldTask := filepath.Join(workdirRoot, "1001") + freshTask := filepath.Join(workdirRoot, "1002") + nonTask := filepath.Join(workdirRoot, "shared") + alphaNumericTask := filepath.Join(workdirRoot, "123abc") + for _, path := range []string{oldTask, freshTask, nonTask, alphaNumericTask} { + require.NoError(t, os.MkdirAll(path, 0o700)) + } + + require.NoError(t, os.Chtimes(oldTask, now.Add(-3*time.Hour), now.Add(-3*time.Hour))) + require.NoError(t, os.Chtimes(freshTask, now.Add(-30*time.Minute), now.Add(-30*time.Minute))) + require.NoError(t, os.Chtimes(nonTask, now.Add(-5*time.Hour), now.Add(-5*time.Hour))) + require.NoError(t, os.Chtimes(alphaNumericTask, now.Add(-5*time.Hour), now.Add(-5*time.Hour))) + + r := &Runner{ + cfg: &config.Config{ + Runner: config.Runner{ + WorkdirCleanupAge: 2 * time.Hour, + }, + }, + now: func() time.Time { return now }, + } + + r.cleanupStaleTaskDirs(context.Background(), workdirRoot) + + assert.NoDirExists(t, oldTask) + assert.DirExists(t, freshTask) + assert.DirExists(t, nonTask) + assert.DirExists(t, alphaNumericTask) +} + +func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) { + r := &Runner{ + cfg: &config.Config{ + Runner: config.Runner{WorkdirCleanupAge: time.Hour}, + }, + now: time.Now, + } + + // Must be a silent no-op rather than a warning or panic when the root + // has not yet been created (e.g. the runner has never executed a task). 
+ r.cleanupStaleTaskDirs(context.Background(), filepath.Join(t.TempDir(), "missing")) +} + +func TestRunnerCleanupStaleTaskDirsHonorsContext(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) + workdirRoot := filepath.Join(t.TempDir(), "workspace") + require.NoError(t, os.MkdirAll(workdirRoot, 0o700)) + + for i := 1001; i <= 1003; i++ { + dir := filepath.Join(workdirRoot, strconv.Itoa(i)) + require.NoError(t, os.MkdirAll(dir, 0o700)) + require.NoError(t, os.Chtimes(dir, now.Add(-3*time.Hour), now.Add(-3*time.Hour))) + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + r := &Runner{ + cfg: &config.Config{ + Runner: config.Runner{WorkdirCleanupAge: time.Hour}, + }, + now: func() time.Time { return now }, + } + + r.cleanupStaleTaskDirs(ctx, workdirRoot) + + for i := 1001; i <= 1003; i++ { + assert.DirExists(t, filepath.Join(workdirRoot, strconv.Itoa(i))) + } +} + +func TestRunnerShouldRunIdleCleanupThrottles(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) + r := &Runner{ + cfg: &config.Config{ + Container: config.Container{ + BindWorkdir: true, + }, + Runner: config.Runner{ + WorkdirCleanupAge: 24 * time.Hour, + IdleCleanupInterval: time.Hour, + }, + }, + now: func() time.Time { return now }, + } + + assert.True(t, r.shouldRunIdleCleanup()) + + now = now.Add(30 * time.Minute) + assert.False(t, r.shouldRunIdleCleanup()) + + now = now.Add(31 * time.Minute) + assert.True(t, r.shouldRunIdleCleanup()) +} + +func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) { + r := &Runner{ + cfg: &config.Config{ + Container: config.Container{ + BindWorkdir: true, + }, + Runner: config.Runner{ + WorkdirCleanupAge: 24 * time.Hour, + IdleCleanupInterval: time.Minute, + }, + }, + now: time.Now, + } + r.runningCount.Store(1) + + assert.False(t, r.shouldRunIdleCleanup()) +} + +func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) { + r := &Runner{ + cfg: 
&config.Config{ + Runner: config.Runner{ + WorkdirCleanupAge: 24 * time.Hour, + IdleCleanupInterval: time.Minute, + }, + }, + now: time.Now, + } + + assert.False(t, r.shouldRunIdleCleanup()) +} + +func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) { + now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC) + + t.Run("cleanup age disabled", func(t *testing.T) { + r := &Runner{ + cfg: &config.Config{ + Container: config.Container{ + BindWorkdir: true, + }, + Runner: config.Runner{ + WorkdirCleanupAge: -1, + IdleCleanupInterval: time.Minute, + }, + }, + now: func() time.Time { return now }, + } + + assert.False(t, r.shouldRunIdleCleanup()) + }) + + t.Run("idle interval disabled", func(t *testing.T) { + r := &Runner{ + cfg: &config.Config{ + Container: config.Container{ + BindWorkdir: true, + }, + Runner: config.Runner{ + WorkdirCleanupAge: 24 * time.Hour, + IdleCleanupInterval: -1, + }, + }, + now: func() time.Time { return now }, + } + + assert.False(t, r.shouldRunIdleCleanup()) + }) +} + +// TestRunnerOnIdleIntegratesCleanup wires the full OnIdle entry point and +// confirms it walks workdir_parent (after the leading-slash trim that +// matches the production path construction) and removes stale numeric dirs. 
+func TestRunnerOnIdleIntegratesCleanup(t *testing.T) {
+	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
+	root := t.TempDir()
+	stale := filepath.Join(root, "1234")
+	require.NoError(t, os.MkdirAll(stale, 0o700))
+	require.NoError(t, os.Chtimes(stale, now.Add(-48*time.Hour), now.Add(-48*time.Hour)))
+
+	r := &Runner{
+		cfg: &config.Config{
+			Container: config.Container{
+				BindWorkdir:   true,
+				WorkdirParent: root, // absolute temp path; production's leading-'/' trim plus re-rooting should resolve back to this same directory — confirm against OnIdle's path construction
+			},
+			Runner: config.Runner{
+				WorkdirCleanupAge:   24 * time.Hour,
+				IdleCleanupInterval: time.Minute,
+			},
+		},
+		now: func() time.Time { return now },
+	}
+
+	r.OnIdle(context.Background())
+
+	assert.NoDirExists(t, stale)
+}
+
+// TestRunnerOnIdleSkipsWhenAlreadyCancelled verifies a pre-cancelled ctx
+// short-circuits cleanup before any directory entry is touched.
+func TestRunnerOnIdleSkipsWhenAlreadyCancelled(t *testing.T) {
+	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
+	root := t.TempDir()
+	stale := filepath.Join(root, "1234")
+	require.NoError(t, os.MkdirAll(stale, 0o700))
+	require.NoError(t, os.Chtimes(stale, now.Add(-48*time.Hour), now.Add(-48*time.Hour)))
+
+	r := &Runner{
+		cfg: &config.Config{
+			Container: config.Container{
+				BindWorkdir:   true,
+				WorkdirParent: root,
+			},
+			Runner: config.Runner{
+				WorkdirCleanupAge:   24 * time.Hour,
+				IdleCleanupInterval: time.Minute,
+			},
+		},
+		now: func() time.Time { return now },
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+	r.OnIdle(ctx)
+
+	assert.DirExists(t, stale)
+}
diff --git a/internal/pkg/config/config.example.yaml b/internal/pkg/config/config.example.yaml
index d1551d5b..eded51c3 100644
--- a/internal/pkg/config/config.example.yaml
+++ b/internal/pkg/config/config.example.yaml
@@ -40,6 +40,12 @@ runner:
   # The runner uses exponential backoff when idle, increasing the interval up to this maximum.
   # Set to 0 or same as fetch_interval to disable backoff.
fetch_interval_max: 5s + # While idle, remove stale bind-workdir task directories older than this duration. + # Setting either workdir_cleanup_age or idle_cleanup_interval to 0 (or any + # non-positive value) disables workdir cleanup entirely. + workdir_cleanup_age: 24h + # Cadence for the idle stale bind-workdir cleanup pass. + idle_cleanup_interval: 10m # The base interval for periodic log flush to the Gitea instance. # Logs may be sent earlier if the buffer reaches log_report_batch_size # or if log_report_max_latency expires after the first buffered row. @@ -107,6 +113,7 @@ container: # If the path starts with '/', the '/' will be trimmed. # For example, if the parent directory is /path/to/my/dir, workdir_parent should be path/to/my/dir # If it's empty, /workspace will be used. + # Purely numeric subdirectories under this path are reserved for task workspaces and may be removed by idle cleanup. workdir_parent: # Volumes (including bind mounts) can be mounted to containers. Glob syntax is supported, see https://github.com/gobwas/glob # You can specify multiple volumes. If the sequence is empty, no volumes can be mounted. diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go index 48aa9dab..a257a9dd 100644 --- a/internal/pkg/config/config.go +++ b/internal/pkg/config/config.go @@ -33,6 +33,8 @@ type Runner struct { FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources. FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources. FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle. + WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories older than this duration during idle cleanup. 
+ IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale bind-workdir cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence. LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush. LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent. LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size. @@ -92,6 +94,7 @@ type Config struct { // If file is not empty, it will be used to load the configuration. func LoadDefault(file string) (*Config, error) { cfg := &Config{} + definedRunnerKeys := map[string]bool{} if file != "" { content, err := os.ReadFile(file) if err != nil { @@ -100,6 +103,10 @@ func LoadDefault(file string) (*Config, error) { if err := yaml.Unmarshal(content, cfg); err != nil { return nil, fmt.Errorf("parse config file %q: %w", file, err) } + definedRunnerKeys, err = definedRunnerConfigKeys(content) + if err != nil { + return nil, fmt.Errorf("parse config file %q for defaults metadata: %w", file, err) + } } compatibleWithOldEnvs(file != "", cfg) @@ -157,6 +164,12 @@ func LoadDefault(file string) (*Config, error) { if cfg.Runner.FetchIntervalMax <= 0 { cfg.Runner.FetchIntervalMax = 5 * time.Second } + if cfg.Runner.WorkdirCleanupAge == 0 && !definedRunnerKeys["workdir_cleanup_age"] { + cfg.Runner.WorkdirCleanupAge = 24 * time.Hour + } + if cfg.Runner.IdleCleanupInterval == 0 && !definedRunnerKeys["idle_cleanup_interval"] { + cfg.Runner.IdleCleanupInterval = 10 * time.Minute + } if cfg.Runner.LogReportInterval <= 0 { cfg.Runner.LogReportInterval = 5 * time.Second } @@ -199,3 +212,30 @@ func LoadDefault(file string) (*Config, error) { return cfg, nil } + +func definedRunnerConfigKeys(content []byte) (map[string]bool, error) { + var root 
yaml.Node + if err := yaml.Unmarshal(content, &root); err != nil { + return nil, err + } + + defined := map[string]bool{} + if len(root.Content) == 0 { + return defined, nil + } + + doc := root.Content[0] + for i := 0; i+1 < len(doc.Content); i += 2 { + key := doc.Content[i] + value := doc.Content[i+1] + if key.Value != "runner" || value.Kind != yaml.MappingNode { + continue + } + for j := 0; j+1 < len(value.Content); j += 2 { + defined[value.Content[j].Value] = true + } + break + } + + return defined, nil +} diff --git a/internal/pkg/config/config_test.go b/internal/pkg/config/config_test.go index da0e414b..4986ee94 100644 --- a/internal/pkg/config/config_test.go +++ b/internal/pkg/config/config_test.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -39,3 +40,80 @@ cache: _, err := LoadDefault(path) require.NoError(t, err) } + +func TestLoadDefault_DefaultsWorkdirCleanupAge(t *testing.T) { + cfg, err := LoadDefault("") + require.NoError(t, err) + assert.Equal(t, 24*time.Hour, cfg.Runner.WorkdirCleanupAge) + assert.Equal(t, 10*time.Minute, cfg.Runner.IdleCleanupInterval) +} + +func TestLoadDefault_UsesConfiguredWorkdirCleanupAge(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + workdir_cleanup_age: 2h30m +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, 150*time.Minute, cfg.Runner.WorkdirCleanupAge) +} + +func TestLoadDefault_UsesConfiguredIdleCleanupInterval(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + idle_cleanup_interval: 45m +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, 45*time.Minute, cfg.Runner.IdleCleanupInterval) +} + +func TestLoadDefault_AllowsDisablingWorkdirCleanup(t *testing.T) { + dir := 
t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + workdir_cleanup_age: 0s + idle_cleanup_interval: 0s +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, time.Duration(0), cfg.Runner.WorkdirCleanupAge) + assert.Equal(t, time.Duration(0), cfg.Runner.IdleCleanupInterval) +} + +func TestLoadDefault_AllowsNegativeWorkdirCleanupValues(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte(` +runner: + workdir_cleanup_age: -1s + idle_cleanup_interval: -1s +`), 0o600)) + + cfg, err := LoadDefault(path) + require.NoError(t, err) + assert.Equal(t, -1*time.Second, cfg.Runner.WorkdirCleanupAge) + assert.Equal(t, -1*time.Second, cfg.Runner.IdleCleanupInterval) +} + +// TestLoadDefault_MalformedYAMLReturnsParseError pins the error surfaced for +// invalid YAML to the canonical "parse config file" message rather than the +// "for defaults metadata" variant — i.e. the main yaml.Unmarshal runs first. +func TestLoadDefault_MalformedYAMLReturnsParseError(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + require.NoError(t, os.WriteFile(path, []byte("runner:\n capacity: [unterminated\n"), 0o600)) + + _, err := LoadDefault(path) + require.Error(t, err) + assert.Contains(t, err.Error(), "parse config file") + assert.NotContains(t, err.Error(), "defaults metadata") +}