1 Commits

Author SHA1 Message Date
Renovate Bot
f223fa44ea fix(deps): update module go.yaml.in/yaml/v4 to v4.0.0-rc.5 2026-06-14 00:15:39 +00:00
8 changed files with 34 additions and 303 deletions

View File

@@ -429,24 +429,6 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string)
return parseEnvFile(e, srcPath, env)
}
// removeAll is the filesystem delete used by removeAllWithContext. A package
// var so tests can substitute a blocking stub without patching os.RemoveAll.
var removeAll = os.RemoveAll
// removeAllWithContext runs removeAll in a goroutine and returns once it
// finishes or ctx is cancelled. On cancellation the goroutine is left running —
// a delete blocked inside a syscall cannot be interrupted (see runWithTimeout).
func removeAllWithContext(ctx context.Context, path string) error {
done := make(chan error, 1)
go func() { done <- removeAll(path) }()
select {
case err := <-done:
return err
case <-ctx.Done():
return ctx.Err()
}
}
func removePathWithRetry(ctx context.Context, path string) error {
if path == "" {
return nil
@@ -466,13 +448,10 @@ func removePathWithRetry(ctx context.Context, path string) error {
case <-time.After(delay):
}
}
lastErr = removeAllWithContext(ctx, path)
lastErr = os.RemoveAll(path)
if lastErr == nil {
return nil
}
if errors.Is(lastErr, context.DeadlineExceeded) {
return lastErr
}
}
return lastErr
}
@@ -554,61 +533,23 @@ func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) {
}
}
// hostCleanupTimeout bounds each filesystem-teardown phase of the host
// environment so a single stalled delete cannot wedge the runner slot forever.
// A var (not const) so tests can shrink it.
var hostCleanupTimeout = 30 * time.Second
// runWithTimeout runs fn in a goroutine and returns once it finishes or timeout
// elapses, whichever comes first. On timeout the goroutine is left running — an
// os.RemoveAll blocked inside a delete syscall (AV/EDR filter drivers, an
// unresponsive network mount, a dying disk) cannot be interrupted — and
// context.DeadlineExceeded is returned. Leaking the goroutine and the scratch
// state it was deleting is strictly better than blocking the caller forever and
// permanently losing the runner's capacity slot; the leaked scratch dir is
// reclaimed later by the runner's idle stale-dir sweep.
func runWithTimeout(fn func(), timeout time.Duration) error {
done := make(chan struct{})
go func() {
defer close(done)
fn()
}()
timer := time.NewTimer(timeout)
defer timer.Stop()
select {
case <-done:
return nil
case <-timer.C:
return context.DeadlineExceeded
}
}
func (e *HostEnvironment) Remove() common.Executor {
return func(ctx context.Context) error {
logger := common.Logger(ctx)
// Ensure any lingering child processes are ended before attempting
// to remove the workspace (Windows file locks otherwise prevent cleanup).
e.terminateRunningProcesses(ctx)
// Only removes per-job misc state. Must not remove the cache/toolcache root.
// Bound it: CleanUp is a caller-supplied, typically unbounded os.RemoveAll,
// and a delete stalled by a filesystem filter driver would otherwise hang
// the job forever at "Cleaning up container" and hold the capacity slot.
if e.CleanUp != nil {
logger.Debugf("running host environment cleanup callback")
if err := runWithTimeout(e.CleanUp, hostCleanupTimeout); err != nil {
logger.Warnf("host environment cleanup did not finish within %s; continuing job completion, scratch state may be leaked and is reclaimed by the idle stale-dir sweep", hostCleanupTimeout)
} else {
logger.Debugf("host environment cleanup callback finished")
}
e.CleanUp()
}
// Detach: a cancelled ctx would skip removePathWithRetry's retries,
// which absorb Windows file-handle release lag after the kill above.
rmCtx, rmCancel := context.WithTimeout(context.Background(), hostCleanupTimeout)
rmCtx, rmCancel := context.WithTimeout(context.Background(), 30*time.Second)
defer rmCancel()
logger := common.Logger(ctx)
var errs []error
if err := removePathWithRetry(rmCtx, e.Path); err != nil {
logger.Warnf("failed to remove host misc state %s: %v", e.Path, err)
@@ -620,16 +561,9 @@ func (e *HostEnvironment) Remove() common.Executor {
errs = append(errs, err)
}
}
for _, err := range errs {
if !errors.Is(err, context.DeadlineExceeded) {
return errors.Join(errs...)
}
}
// Bounded teardown timed out; warnings already logged above. Do not
// fail job completion — leaked scratch is reclaimed by the idle sweep.
return nil
}
}
func (e *HostEnvironment) ToContainerPath(path string) string {
if bp, err := filepath.Rel(e.Workdir, path); err != nil {

View File

@@ -15,7 +15,6 @@ import (
"runtime"
"strings"
"testing"
"time"
"gitea.com/gitea/runner/act/common"
@@ -189,118 +188,6 @@ func TestHostEnvironmentRemoveCleansWorkdirWhenOwned(t *testing.T) {
assert.ErrorIs(t, err, os.ErrNotExist)
}
func TestRemoveAllWithContextDoesNotHangOnStuckDelete(t *testing.T) {
release := make(chan struct{})
stubDone := make(chan struct{})
orig := removeAll
removeAll = func(string) error {
defer close(stubDone)
<-release
return nil
}
// removeAllWithContext intentionally leaks the delete goroutine on timeout,
// and that goroutine still references removeAll. Unblock it and wait for it
// to return before restoring the var, so the restore can't race the read.
t.Cleanup(func() {
close(release)
<-stubDone
removeAll = orig
})
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
err := removeAllWithContext(ctx, t.TempDir())
require.ErrorIs(t, err, context.DeadlineExceeded)
}
// TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp guards against a stalled
// CleanUp callback (e.g. an os.RemoveAll blocked by an AV/EDR filter driver or
// an unresponsive mount) wedging the runner slot forever at "Cleaning up
// container". Remove must time out the callback and complete job teardown.
func TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp(t *testing.T) {
// Keep the suite fast: shrink the per-phase teardown timeout for this test.
orig := hostCleanupTimeout
hostCleanupTimeout = 100 * time.Millisecond
t.Cleanup(func() { hostCleanupTimeout = orig })
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
path := filepath.Join(base, "misc", "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
release := make(chan struct{})
t.Cleanup(func() { close(release) }) // unblock the leaked goroutine at test end
e := &HostEnvironment{
Path: path,
CleanUp: func() {
<-release // simulate a delete syscall stuck indefinitely
},
StdOut: os.Stdout,
}
done := make(chan error, 1)
go func() { done <- e.Remove()(ctx) }()
select {
case err := <-done:
require.NoError(t, err)
case <-time.After(10 * time.Second):
t.Fatal("Remove() hung on a stuck CleanUp callback")
}
}
// TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval guards against a
// stalled os.RemoveAll on the misc/workspace paths (same AV/EDR wedge as
// #1023) wedging job completion after the CleanUp callback has already timed
// out or finished.
func TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval(t *testing.T) {
origTimeout := hostCleanupTimeout
hostCleanupTimeout = 100 * time.Millisecond
t.Cleanup(func() { hostCleanupTimeout = origTimeout })
release := make(chan struct{})
stubDone := make(chan struct{})
origRemoveAll := removeAll
removeAll = func(string) error {
defer close(stubDone)
<-release
return nil
}
// The stuck delete goroutine outlives the timed-out Remove and still reads
// removeAll; unblock it and wait before restoring to avoid a restore/read race.
t.Cleanup(func() {
close(release)
<-stubDone
removeAll = origRemoveAll
})
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
path := filepath.Join(base, "misc", "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
e := &HostEnvironment{
Path: path,
StdOut: os.Stdout,
}
done := make(chan error, 1)
go func() { done <- e.Remove()(ctx) }()
select {
case err := <-done:
require.NoError(t, err)
case <-time.After(10 * time.Second):
t.Fatal("Remove() hung on a stuck path removal")
}
}
func TestBuildWindowsWorkspaceKillScript(t *testing.T) {
t.Run("single dir", func(t *testing.T) {
s := buildWindowsWorkspaceKillScript([]string{`C:\workspace\job1`})

2
go.mod
View File

@@ -36,7 +36,7 @@ require (
github.com/stretchr/testify v1.11.1
github.com/timshannon/bolthold v0.0.0-20240314194003-30aac6950928
go.etcd.io/bbolt v1.4.3
go.yaml.in/yaml/v4 v4.0.0-rc.3
go.yaml.in/yaml/v4 v4.0.0-rc.5
golang.org/x/sys v0.46.0
golang.org/x/term v0.44.0
google.golang.org/protobuf v1.36.11

2
go.sum
View File

@@ -232,6 +232,8 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
go.yaml.in/yaml/v4 v4.0.0-rc.3 h1:3h1fjsh1CTAPjW7q/EMe+C8shx5d8ctzZTrLcs/j8Go=
go.yaml.in/yaml/v4 v4.0.0-rc.3/go.mod h1:aZqd9kCMsGL7AuUv/m/PvWLdg5sjJsZ4oHDEnfPPfY0=
go.yaml.in/yaml/v4 v4.0.0-rc.5 h1:JVliQq9EGOYaTgMi+k8BhUJyqcGk4ZqeuiN1Cirba9c=
go.yaml.in/yaml/v4 v4.0.0-rc.5/go.mod h1:aZqd9kCMsGL7AuUv/m/PvWLdg5sjJsZ4oHDEnfPPfY0=
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988=
golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc=

View File

@@ -127,22 +127,15 @@ func (r *Runner) OnIdle(ctx context.Context) {
if !r.shouldRunIdleCleanup() {
return
}
// Bind-workdir mode: reclaim stale per-task workspace dirs (numeric task IDs).
if r.cfg.Container.BindWorkdir {
workdirParent := strings.TrimLeft(r.cfg.Container.WorkdirParent, "/")
workdirRoot := filepath.FromSlash("/" + workdirParent)
r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir)
}
// Host mode: reclaim per-job scratch dirs left behind when HostEnvironment
// cleanup timed out (e.g. a delete stalled by an AV/EDR filter driver). They
// sit under the host workdir parent alongside the shared tool_cache, which
// the name match leaves untouched. No-op when no host-mode job ever ran.
if hostRoot := filepath.FromSlash(r.cfg.Host.WorkdirParent); hostRoot != "" {
r.cleanupStaleDirs(ctx, hostRoot, isHostScratchDir)
}
r.cleanupStaleTaskDirs(ctx, workdirRoot)
}
func (r *Runner) shouldRunIdleCleanup() bool {
if !r.cfg.Container.BindWorkdir {
return false
}
if r.cfg.Runner.WorkdirCleanupAge <= 0 || r.cfg.Runner.IdleCleanupInterval <= 0 {
return false
}
@@ -162,52 +155,18 @@ func (r *Runner) shouldRunIdleCleanup() bool {
}
}
// cleanupStaleTaskDirs reclaims stale bind-workdir per-task directories under
// workdirRoot. Retained as a thin wrapper so existing callers and tests keep a
// stable entry point.
func (r *Runner) cleanupStaleTaskDirs(ctx context.Context, workdirRoot string) {
r.cleanupStaleDirs(ctx, workdirRoot, isTaskIDDir)
}
// isTaskIDDir reports whether name is a per-task workspace dir (numeric task
// ID). Any other directory is skipped to avoid deleting operator-managed data
// under workdir_root.
func isTaskIDDir(name string) bool {
_, err := strconv.ParseUint(name, 10, 64)
return err == nil
}
// isHostScratchDir reports whether name is a per-job host-mode scratch dir:
// hex.EncodeToString of 8 random bytes, i.e. exactly 16 lowercase hex chars
// (see startHostEnvironment in act/runner/run_context.go). The narrow match
// leaves the sibling shared "tool_cache" dir and any operator data untouched.
func isHostScratchDir(name string) bool {
if len(name) != 16 {
return false
}
for _, c := range name {
if (c < '0' || c > '9') && (c < 'a' || c > 'f') {
return false
}
}
return true
}
// cleanupStaleDirs removes immediate child directories of root that match and
// whose mtime is older than WorkdirCleanupAge. It is a no-op when root does not
// exist yet (the runner has never written there).
func (r *Runner) cleanupStaleDirs(ctx context.Context, root string, match func(name string) bool) {
entries, err := os.ReadDir(root)
entries, err := os.ReadDir(workdirRoot)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
return
}
log.Warnf("failed to list directory %s for stale cleanup: %v", root, err)
log.Warnf("failed to list task workspace root %s for stale cleanup: %v", workdirRoot, err)
return
}
// A task may begin between shouldRunIdleCleanup's running-count check and
// the loop below. That is safe because new dirs are created with the
// the loop below. That is safe because new task dirs are created with the
// current mtime and therefore fall on the keep side of cutoff.
cutoff := r.now().Add(-r.cfg.Runner.WorkdirCleanupAge)
for _, entry := range entries {
@@ -217,23 +176,25 @@ func (r *Runner) cleanupStaleDirs(ctx context.Context, root string, match func(n
if !entry.IsDir() {
continue
}
if !match(entry.Name()) {
// Task workspaces are indexed by numeric task IDs; skip any other
// directories to avoid deleting operator-managed data under workdir_root.
if _, err := strconv.ParseUint(entry.Name(), 10, 64); err != nil {
continue
}
info, err := entry.Info()
if err != nil {
log.Warnf("failed to stat %s: %v", filepath.Join(root, entry.Name()), err)
log.Warnf("failed to stat task workspace %s: %v", filepath.Join(workdirRoot, entry.Name()), err)
continue
}
if info.ModTime().After(cutoff) {
continue
}
dir := filepath.Join(root, entry.Name())
if err := os.RemoveAll(dir); err != nil {
log.Warnf("failed to clean stale directory %s: %v", dir, err)
taskDir := filepath.Join(workdirRoot, entry.Name())
if err := os.RemoveAll(taskDir); err != nil {
log.Warnf("failed to clean stale task workspace %s: %v", taskDir, err)
continue
}
log.Infof("cleaned stale directory %s", dir)
log.Infof("cleaned stale task workspace %s", taskDir)
}
}

View File

@@ -52,55 +52,6 @@ func TestRunnerCleanupStaleTaskDirs(t *testing.T) {
assert.DirExists(t, alphaNumericTask)
}
// TestRunnerOnIdleCleansStaleHostScratchDirs covers the host-mode leak path:
// a per-job scratch dir (16 hex chars) left behind by a timed-out cleanup must
// be reclaimed, while the shared tool_cache and operator data are preserved.
func TestRunnerOnIdleCleansStaleHostScratchDirs(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
hostRoot := filepath.Join(t.TempDir(), "act")
require.NoError(t, os.MkdirAll(hostRoot, 0o700))
staleScratch := filepath.Join(hostRoot, "0123456789abcdef") // 16 hex
freshScratch := filepath.Join(hostRoot, "fedcba9876543210")
toolCache := filepath.Join(hostRoot, "tool_cache")
operatorData := filepath.Join(hostRoot, "keep-me")
for _, path := range []string{staleScratch, freshScratch, toolCache, operatorData} {
require.NoError(t, os.MkdirAll(path, 0o700))
}
require.NoError(t, os.Chtimes(staleScratch, now.Add(-48*time.Hour), now.Add(-48*time.Hour)))
require.NoError(t, os.Chtimes(freshScratch, now.Add(-10*time.Minute), now.Add(-10*time.Minute)))
require.NoError(t, os.Chtimes(toolCache, now.Add(-72*time.Hour), now.Add(-72*time.Hour)))
require.NoError(t, os.Chtimes(operatorData, now.Add(-72*time.Hour), now.Add(-72*time.Hour)))
r := &Runner{
cfg: &config.Config{
Host: config.Host{WorkdirParent: hostRoot},
Runner: config.Runner{
WorkdirCleanupAge: 24 * time.Hour,
IdleCleanupInterval: time.Minute,
},
},
now: func() time.Time { return now },
}
r.OnIdle(context.Background())
assert.NoDirExists(t, staleScratch) // stale scratch reclaimed
assert.DirExists(t, freshScratch) // within cleanup age, kept
assert.DirExists(t, toolCache) // shared cache, never a scratch match
assert.DirExists(t, operatorData) // non-hex name, untouched
}
func TestIsHostScratchDir(t *testing.T) {
assert.True(t, isHostScratchDir("0123456789abcdef"))
assert.True(t, isHostScratchDir("ffffffffffffffff"))
assert.False(t, isHostScratchDir("tool_cache"))
assert.False(t, isHostScratchDir("0123456789ABCDEF")) // hex.EncodeToString is lowercase
assert.False(t, isHostScratchDir("0123456789abcde")) // 15 chars
assert.False(t, isHostScratchDir("0123456789abcdef0")) // 17 chars
assert.False(t, isHostScratchDir("123"))
}
func TestRunnerCleanupStaleTaskDirsMissingRoot(t *testing.T) {
r := &Runner{
cfg: &config.Config{
@@ -184,10 +135,7 @@ func TestRunnerShouldRunIdleCleanupSkipsWhenJobRunning(t *testing.T) {
assert.False(t, r.shouldRunIdleCleanup())
}
// Idle cleanup runs regardless of bind_workdir: host mode (bind_workdir off)
// still leaves per-job scratch dirs that the sweep must reclaim.
func TestRunnerShouldRunIdleCleanupRunsWithoutBindWorkdir(t *testing.T) {
now := time.Date(2026, time.April, 29, 20, 0, 0, 0, time.UTC)
func TestRunnerShouldRunIdleCleanupSkipsWhenBindWorkdirDisabled(t *testing.T) {
r := &Runner{
cfg: &config.Config{
Runner: config.Runner{
@@ -195,10 +143,10 @@ func TestRunnerShouldRunIdleCleanupRunsWithoutBindWorkdir(t *testing.T) {
IdleCleanupInterval: time.Minute,
},
},
now: func() time.Time { return now },
now: time.Now,
}
assert.True(t, r.shouldRunIdleCleanup())
assert.False(t, r.shouldRunIdleCleanup())
}
func TestRunnerShouldRunIdleCleanupSkipsWhenDisabled(t *testing.T) {

View File

@@ -40,12 +40,11 @@ runner:
# The runner uses exponential backoff when idle, increasing the interval up to this maximum.
# Set to 0 or same as fetch_interval to disable backoff.
fetch_interval_max: 5s
# While idle, remove stale bind-workdir task directories and orphaned host-mode
# scratch directories (left behind when a host cleanup delete stalls) older than
# this duration. Setting either workdir_cleanup_age or idle_cleanup_interval to 0
# (or any non-positive value) disables stale-directory cleanup entirely.
# While idle, remove stale bind-workdir task directories older than this duration.
# Setting either workdir_cleanup_age or idle_cleanup_interval to 0 (or any
# non-positive value) disables workdir cleanup entirely.
workdir_cleanup_age: 24h
# Cadence for the idle stale-directory cleanup pass.
# Cadence for the idle stale bind-workdir cleanup pass.
idle_cleanup_interval: 10m
# The base interval for periodic log flush to the Gitea instance.
# Logs may be sent earlier if the buffer reaches log_report_batch_size

View File

@@ -33,8 +33,8 @@ type Runner struct {
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle.
WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories and orphaned host-mode scratch dirs older than this duration during idle cleanup.
IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale-directory cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
WorkdirCleanupAge time.Duration `yaml:"workdir_cleanup_age"` // WorkdirCleanupAge removes stale bind-workdir task directories older than this duration during idle cleanup.
IdleCleanupInterval time.Duration `yaml:"idle_cleanup_interval"` // IdleCleanupInterval runs stale bind-workdir cleanup periodically while the runner is idle. Set to 0 to disable cleanup cadence.
LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush.
LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.