feat: add startup janitor for stale bind-workdir task workspaces (#870)

- Add idle-time cleanup for stale bind-workdir task directories instead of cleaning them on the task execution path.
- Make cleanup behavior configurable with `runner.startup_cleanup_age` as the stale-age threshold (default: `24h`) and `runner.idle_cleanup_interval` as the idle cleanup cadence (default: `10m`).
- Restrict cleanup scope to numeric task directory names only, to avoid touching operator-managed folders.
- Document the cleanup settings in `config.example.yaml` and `README.md`.
- Add tests for stale-directory cleanup, idle cleanup throttling, and config default/override parsing.

## Why

When a runner or host crashes, normal per-task cleanup may not run, leaving stale task directories under the bind-workdir root. Running this cleanup only while the runner is idle recovers that disk space without adding overhead to active job execution.

Note: the key name `startup_cleanup_age` now reads a bit misleadingly relative to the actual behavior, because cleanup runs during idle windows rather than at startup; the wording around it could be tightened in a follow-up.

---------

Co-authored-by: silverwind <me@silverwind.io>
Reviewed-on: https://gitea.com/gitea/runner/pulls/870
Reviewed-by: silverwind <2021+silverwind@noreply.gitea.com>
This commit is contained in:
Nicolas
2026-05-05 20:11:44 +00:00
parent a22119cf88
commit 2a4d56c650
8 changed files with 556 additions and 3 deletions

View File

@@ -7,12 +7,14 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"maps"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
@@ -46,8 +48,10 @@ type Runner struct {
envs map[string]string
cacheHandler *artifactcache.Handler
runningTasks sync.Map
runningCount atomic.Int64
runningTasks sync.Map
runningCount atomic.Int64
lastIdleCleanupUnixNano atomic.Int64
now func() time.Time
}
func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client) *Runner {
@@ -90,13 +94,94 @@ func NewRunner(cfg *config.Config, reg *config.Registration, cli client.Client)
envs["GITEA_ACTIONS"] = "true"
envs["GITEA_ACTIONS_RUNNER_VERSION"] = ver.Version()
return &Runner{
runner := &Runner{
name: reg.Name,
cfg: cfg,
client: cli,
labels: ls,
envs: envs,
cacheHandler: cacheHandler,
now: time.Now,
}
return runner
}
// OnIdle is the maintenance hook for idle polling windows. When
// shouldRunIdleCleanup grants a pass, it sweeps stale task workspaces under
// the configured workdir parent. The sweep runs synchronously in the caller
// (per the original note, the poller goroutine), so the throttle applied by
// shouldRunIdleCleanup (runner.idle_cleanup_interval) is what bounds its
// effect on poll cadence, even when the workdir root is large.
func (r *Runner) OnIdle(ctx context.Context) {
	if r.shouldRunIdleCleanup() {
		// Strip any leading slashes and re-add exactly one, then convert the
		// slash-separated form to the OS-specific separator.
		trimmed := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/")
		r.cleanupStaleTaskDirs(ctx, filepath.FromSlash("/"+trimmed))
	}
}
// shouldRunIdleCleanup reports whether an idle cleanup pass may start now.
//
// It returns false when bind-workdir is disabled, when either the cleanup age
// or the idle cleanup interval is non-positive, when any task is running, or
// when less than the idle cleanup interval has elapsed since the last granted
// pass. When it returns true it has already recorded the current time as the
// new "last cleanup" stamp.
func (r *Runner) shouldRunIdleCleanup() bool {
	maxAge, every := r.cfg.Runner.WorkdirCleanupAge, r.cfg.Runner.IdleCleanupInterval
	switch {
	case !r.cfg.Container.BindWorkdir:
		return false
	case maxAge <= 0 || every <= 0:
		return false
	case r.RunningCount() != 0:
		return false
	}

	current := r.now()
	stamp := current.UnixNano()
	for {
		// A stored value of zero means no pass has been granted yet.
		prev := r.lastIdleCleanupUnixNano.Load()
		tooSoon := prev != 0 && current.Sub(time.Unix(0, prev)) < every
		if tooSoon {
			return false
		}
		// Claim the pass only if nobody changed the stamp since it was read;
		// otherwise re-read and re-evaluate.
		if r.lastIdleCleanupUnixNano.CompareAndSwap(prev, stamp) {
			return true
		}
	}
}
// cleanupStaleTaskDirs removes stale task workspaces located directly under
// workdirRoot.
//
// Only directories whose name parses as an unsigned base-10 integer are
// considered, and only those whose modification time is not after
// now - WorkdirCleanupAge are removed. A missing workdirRoot is a silent
// no-op; other listing, stat and removal failures are logged as warnings and
// do not abort the sweep. The sweep stops as soon as ctx is done.
func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) {
	entries, err := os.ReadDir(workdirRoot)
	if err != nil {
		// No root means no workspace was ever created there: nothing to do.
		if errors.Is(err, os.ErrNotExist) {
			return
		}
		log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err)
		return
	}

	// A task may begin between shouldRunIdleCleanup's running-count check and
	// the loop below. That is safe because new task dirs are created with the
	// current mtime and therefore fall on the keep side of cutoff.
	cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge)
	for _, entry := range entries {
		if ctx.Err() != nil {
			return
		}
		if !entry.IsDir() {
			continue
		}
		// Task workspaces are indexed by numeric task IDs; skip any other
		// directories to avoid deleting operator-managed data under workdir_root.
		if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil {
			continue
		}

		taskDir := filepath.Join(workdirRoot, entry.Name())
		info, err := entry.Info()
		if err != nil {
			// The entry can disappear between ReadDir and Info (for example
			// removed by another process). That is the outcome we want anyway,
			// so treat it like the missing-root case and stay quiet.
			if errors.Is(err, os.ErrNotExist) {
				continue
			}
			log.Warnf("failed to stat task workspace %s: %v", taskDir, err)
			continue
		}
		if info.ModTime().After(cutoff) {
			continue
		}

		if err := os.RemoveAll(taskDir); err != nil {
			log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err)
			continue
		}
		log.Infof("cleaned stale task workspace %s", taskDir)
	}
}

View File

@@ -0,0 +1,247 @@
// Copyright 2026 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package run
import (
"context"
"os"
"path/filepath"
"strconv"
"testing"
"time"
"gitea.com/gitea/runner/internal/pkg/config"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestRunnerCleanupStaleTaskDirs checks the selection rules of
// cleanupStaleTaskDirs with a two-hour cleanup age: a numeric directory older
// than the cutoff is removed, while a fresh numeric directory and old
// directories with non-numeric names are kept.
func TestRunnerCleanupStaleTaskDirs(t *testing.T) {
	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	workdirRoot := filepath.Join(t.TempDir(), "workspace")
	require.NoError(t, os.MkdirAll(workdirRoot, 0o700))

	fixtures := []struct {
		name       string
		age        time.Duration
		wantRemove bool
	}{
		{name: "1001", age: 3 * time.Hour, wantRemove: true}, // stale numeric task dir
		{name: "1002", age: 30 * time.Minute},                // fresh numeric task dir
		{name: "shared", age: 5 * time.Hour},                 // old, but not a task dir
		{name: "123abc", age: 5 * time.Hour},                 // old, only partly numeric
	}
	for _, f := range fixtures {
		dir := filepath.Join(workdirRoot, f.name)
		require.NoError(t, os.MkdirAll(dir, 0o700))
	}
	for _, f := range fixtures {
		stamp := now.Add(-f.age)
		require.NoError(t, os.Chtimes(filepath.Join(workdirRoot, f.name), stamp, stamp))
	}

	r := &Runner{
		cfg: &config.Config{
			Runner: config.Runner{WorkdirCleanupAge: 2 * time.Hour},
		},
		now: func() time.Time { return now },
	}
	r.cleanupStaleTaskDirs(context.Background(), workdirRoot)

	for _, f := range fixtures {
		dir := filepath.Join(workdirRoot, f.name)
		if f.wantRemove {
			assert.NoDirExists(t, dir)
			continue
		}
		assert.DirExists(t, dir)
	}
}
func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{WorkdirCleanupAge: time.Hour},
},
now: time.Now,
}
// Must be a silent no-op rather than a warning or panic when the root
// has not yet been created (e.g. the runner has never executed a task).
r.cleanupStaleTaskDirs(context.Background(), filepath.Join(t.TempDir(), "missing"))
}
// TestRunnerCleanupStaleTaskDirsHonorsContext verifies that an
// already-cancelled context prevents cleanupStaleTaskDirs from removing
// anything, even though every task directory is older than the cleanup age.
func TestRunnerCleanupStaleTaskDirsHonorsContext(t *testing.T) {
	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	workdirRoot := filepath.Join(t.TempDir(), "workspace")
	require.NoError(t, os.MkdirAll(workdirRoot, 0o700))

	// Three stale numeric task directories, all eligible for removal.
	staleStamp := now.Add(-3 * time.Hour)
	taskDirs := make([]string, 0, 3)
	for id := 1001; id <= 1003; id++ {
		taskDirs = append(taskDirs, filepath.Join(workdirRoot, strconv.Itoa(id)))
	}
	for _, dir := range taskDirs {
		require.NoError(t, os.MkdirAll(dir, 0o700))
		require.NoError(t, os.Chtimes(dir, staleStamp, staleStamp))
	}

	cancelledCtx, cancel := context.WithCancel(context.Background())
	cancel()

	r := &Runner{
		cfg: &config.Config{
			Runner: config.Runner{WorkdirCleanupAge: time.Hour},
		},
		now: func() time.Time { return now },
	}
	r.cleanupStaleTaskDirs(cancelledCtx, workdirRoot)

	for _, dir := range taskDirs {
		assert.DirExists(t, dir)
	}
}
// TestRunnerShouldRunIdleCleanupThrottles drives shouldRunIdleCleanup with a
// fake clock and a one-hour idle interval: the first call is allowed, a call
// 30 minutes later is refused, and a call a further 31 minutes later (61
// minutes after the first) is allowed again.
func TestRunnerShouldRunIdleCleanupThrottles(t *testing.T) {
	clock := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	r := &Runner{
		cfg: &config.Config{
			Container: config.Container{BindWorkdir: true},
			Runner: config.Runner{
				WorkdirCleanupAge:   24 * time.Hour,
				IdleCleanupInterval: time.Hour,
			},
		},
		// The closure reads clock on every call, so advancing it below moves
		// the runner's notion of "now".
		now: func() time.Time { return clock },
	}

	steps := []struct {
		advance time.Duration
		allowed bool
	}{
		{advance: 0, allowed: true},
		{advance: 30 * time.Minute, allowed: false},
		{advance: 31 * time.Minute, allowed: true},
	}
	for _, step := range steps {
		clock = clock.Add(step.advance)
		assert.Equal(t, step.allowed, r.shouldRunIdleCleanup())
	}
}
func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Container: config.Container{
BindWorkdir: true,
},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: time.Now,
}
r.runningCount.Store(1)
assert.False(t, r.shouldRunIdleCleanup())
}
func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: time.Now,
}
assert.False(t, r.shouldRunIdleCleanup())
}
// TestRunnerShouldRunIdleCleanupSkipsWhenDisabled verifies that a negative
// cleanup age or a negative idle interval each disable idle cleanup, even
// with bind-workdir enabled.
func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) {
	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)

	cases := []struct {
		name     string
		age      time.Duration
		interval time.Duration
	}{
		{name: "cleanup age disabled", age: -1, interval: time.Minute},
		{name: "idle interval disabled", age: 24 * time.Hour, interval: -1},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			r := &Runner{
				cfg: &config.Config{
					Container: config.Container{BindWorkdir: true},
					Runner: config.Runner{
						WorkdirCleanupAge:   tc.age,
						IdleCleanupInterval: tc.interval,
					},
				},
				now: func() time.Time { return now },
			}
			assert.False(t, r.shouldRunIdleCleanup())
		})
	}
}
// TestRunnerOnIdleIntegratesCleanup exercises the full OnIdle entry point:
// with cleanup enabled and no task running, OnIdle must resolve the workdir
// root from Container.WorkdirParent and remove a stale numeric task directory
// found there.
func TestRunnerOnIdleIntegratesCleanup(t *testing.T) {
	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	root := t.TempDir()

	// A numeric task dir twice as old as the 24h cleanup age.
	staleDir := filepath.Join(root, "1234")
	staleStamp := now.Add(-48 * time.Hour)
	require.NoError(t, os.MkdirAll(staleDir, 0o700))
	require.NoError(t, os.Chtimes(staleDir, staleStamp, staleStamp))

	cfg := &config.Config{
		Container: config.Container{
			BindWorkdir: true,
			// Temp dir path (absolute on Unix-like systems); OnIdle trims any
			// leading slashes and re-adds a single one.
			WorkdirParent: root,
		},
		Runner: config.Runner{
			WorkdirCleanupAge:   24 * time.Hour,
			IdleCleanupInterval: time.Minute,
		},
	}
	r := &Runner{cfg: cfg, now: func() time.Time { return now }}

	r.OnIdle(context.Background())

	assert.NoDirExists(t, staleDir)
}
// TestRunnerOnIdleSkipsWhenAlreadyCancelled verifies that calling OnIdle with
// a context that is already cancelled leaves a stale task directory in place,
// i.e. the cleanup stops before removing any entry.
func TestRunnerOnIdleSkipsWhenAlreadyCancelled(t *testing.T) {
	now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
	root := t.TempDir()

	// A numeric task dir that would be removed if the sweep were to proceed.
	staleDir := filepath.Join(root, "1234")
	staleStamp := now.Add(-48 * time.Hour)
	require.NoError(t, os.MkdirAll(staleDir, 0o700))
	require.NoError(t, os.Chtimes(staleDir, staleStamp, staleStamp))

	cfg := &config.Config{
		Container: config.Container{
			BindWorkdir:   true,
			WorkdirParent: root,
		},
		Runner: config.Runner{
			WorkdirCleanupAge:   24 * time.Hour,
			IdleCleanupInterval: time.Minute,
		},
	}
	r := &Runner{cfg: cfg, now: func() time.Time { return now }}

	cancelledCtx, cancel := context.WithCancel(context.Background())
	cancel()
	r.OnIdle(cancelledCtx)

	assert.DirExists(t, staleDir)
}