fix(host): bound host-environment cleanup and reclaim leaked scratch dirs (#1024)

Fixes #1023.

## Problem
In Windows host mode, a single stalled delete syscall (AV/EDR filter driver, unresponsive mount, dying disk) wedged the job forever at `Cleaning up container`. `HostEnvironment.Remove()` bounds every teardown phase (`terminateRunningProcesses`, both `removePathWithRetry` calls) except the `CleanUp` callback — an unbounded `os.RemoveAll(miscpath)` assigned in `startHostEnvironment`. The runner then held its capacity slot indefinitely, the task was reaped as a zombie, and there were no diagnostics.

## Fix
- **Bound the cleanup (availability):** `Remove()` now runs `CleanUp` under `hostCleanupTimeout` (30s) via `runWithTimeout`; on timeout it logs a warning and continues job completion. The stuck goroutine is left to finish (a delete syscall can't be interrupted). Added debug logs around the phase.
- **Reclaim the leak (disk hygiene):** a timed-out cleanup can leave a scratch dir behind, so the existing idle stale-dir sweep is extended to also remove orphaned host-mode scratch dirs (16-hex names) under `Host.WorkdirParent`, leaving the shared `tool_cache` and operator data untouched. The `bind_workdir` gate is dropped from `shouldRunIdleCleanup` so host-mode runners run the sweep.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1024
Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com>
This commit is contained in:
Nicolas
2026-06-14 14:14:43 +00:00
parent 56979e6ab8
commit 33e6d1d8ff
6 changed files with 302 additions and 31 deletions

View File

@@ -429,6 +429,24 @@ func (e *HostEnvironment) UpdateFromEnv(srcPath string, env *map[string]string)
return parseEnvFile(e, srcPath, env)
}
// removeAll is the filesystem delete used by removeAllWithContext. A package
// var so tests can substitute a blocking stub without patching os.RemoveAll.
var removeAll = os.RemoveAll
// removeAllWithContext runs removeAll in a goroutine and returns once it
// finishes or ctx is cancelled. On cancellation the goroutine is left running —
// a delete blocked inside a syscall cannot be interrupted (see runWithTimeout).
func removeAllWithContext(ctx context.Context, path string) error {
done := make(chan error, 1)
go func() { done <- removeAll(path) }()
select {
case err := <-done:
return err
case <-ctx.Done():
return ctx.Err()
}
}
func removePathWithRetry(ctx context.Context, path string) error {
if path == "" {
return nil
@@ -448,10 +466,13 @@ func removePathWithRetry(ctx context.Context, path string) error {
case <-time.After(delay):
}
}
lastErr = os.RemoveAll(path)
lastErr = removeAllWithContext(ctx, path)
if lastErr == nil {
return nil
}
if errors.Is(lastErr, context.DeadlineExceeded) {
return lastErr
}
}
return lastErr
}
@@ -533,23 +554,61 @@ func (e *HostEnvironment) terminateRunningProcesses(ctx context.Context) {
}
}
// hostCleanupTimeout bounds each filesystem-teardown phase of the host
// environment so a single stalled delete cannot wedge the runner slot forever.
// A var (not const) so tests can shrink it.
var hostCleanupTimeout = 30 * time.Second
// runWithTimeout runs fn in a goroutine and returns once it finishes or timeout
// elapses, whichever comes first. On timeout the goroutine is left running — an
// os.RemoveAll blocked inside a delete syscall (AV/EDR filter drivers, an
// unresponsive network mount, a dying disk) cannot be interrupted — and
// context.DeadlineExceeded is returned. Leaking the goroutine and the scratch
// state it was deleting is strictly better than blocking the caller forever and
// permanently losing the runner's capacity slot; the leaked scratch dir is
// reclaimed later by the runner's idle stale-dir sweep.
func runWithTimeout(fn func(), timeout time.Duration) error {
done := make(chan struct{})
go func() {
defer close(done)
fn()
}()
timer := time.NewTimer(timeout)
defer timer.Stop()
select {
case <-done:
return nil
case <-timer.C:
return context.DeadlineExceeded
}
}
func (e *HostEnvironment) Remove() common.Executor {
return func(ctx context.Context) error {
logger := common.Logger(ctx)
// Ensure any lingering child processes are ended before attempting
// to remove the workspace (Windows file locks otherwise prevent cleanup).
e.terminateRunningProcesses(ctx)
// Only removes per-job misc state. Must not remove the cache/toolcache root.
// Bound it: CleanUp is a caller-supplied, typically unbounded os.RemoveAll,
// and a delete stalled by a filesystem filter driver would otherwise hang
// the job forever at "Cleaning up container" and hold the capacity slot.
if e.CleanUp != nil {
e.CleanUp()
logger.Debugf("running host environment cleanup callback")
if err := runWithTimeout(e.CleanUp, hostCleanupTimeout); err != nil {
logger.Warnf("host environment cleanup did not finish within %s; continuing job completion, scratch state may be leaked and is reclaimed by the idle stale-dir sweep", hostCleanupTimeout)
} else {
logger.Debugf("host environment cleanup callback finished")
}
}
// Detach: a cancelled ctx would skip removePathWithRetry's retries,
// which absorb Windows file-handle release lag after the kill above.
rmCtx, rmCancel := context.WithTimeout(context.Background(), 30*time.Second)
rmCtx, rmCancel := context.WithTimeout(context.Background(), hostCleanupTimeout)
defer rmCancel()
logger := common.Logger(ctx)
var errs []error
if err := removePathWithRetry(rmCtx, e.Path); err != nil {
logger.Warnf("failed to remove host misc state %s: %v", e.Path, err)
@@ -561,7 +620,14 @@ func (e *HostEnvironment) Remove() common.Executor {
errs = append(errs, err)
}
}
return errors.Join(errs...)
for _, err := range errs {
if !errors.Is(err, context.DeadlineExceeded) {
return errors.Join(errs...)
}
}
// Bounded teardown timed out; warnings already logged above. Do not
// fail job completion — leaked scratch is reclaimed by the idle sweep.
return nil
}
}

View File

@@ -15,6 +15,7 @@ import (
"runtime"
"strings"
"testing"
"time"
"gitea.com/gitea/runner/act/common"
@@ -188,6 +189,118 @@ func TestHostEnvironmentRemoveCleansWorkdirWhenOwned(t *testing.T) {
assert.ErrorIs(t, err, os.ErrNotExist)
}
func TestRemoveAllWithContextDoesNotHangOnStuckDelete(t *testing.T) {
release := make(chan struct{})
stubDone := make(chan struct{})
orig := removeAll
removeAll = func(string) error {
defer close(stubDone)
<-release
return nil
}
// removeAllWithContext intentionally leaks the delete goroutine on timeout,
// and that goroutine still references removeAll. Unblock it and wait for it
// to return before restoring the var, so the restore can't race the read.
t.Cleanup(func() {
close(release)
<-stubDone
removeAll = orig
})
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer cancel()
err := removeAllWithContext(ctx, t.TempDir())
require.ErrorIs(t, err, context.DeadlineExceeded)
}
// TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp guards against a stalled
// CleanUp callback (e.g. an os.RemoveAll blocked by an AV/EDR filter driver or
// an unresponsive mount) wedging the runner slot forever at "Cleaning up
// container". Remove must time out the callback and complete job teardown.
func TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp(t *testing.T) {
// Keep the suite fast: shrink the per-phase teardown timeout for this test.
orig := hostCleanupTimeout
hostCleanupTimeout = 100 * time.Millisecond
t.Cleanup(func() { hostCleanupTimeout = orig })
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
path := filepath.Join(base, "misc", "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
release := make(chan struct{})
t.Cleanup(func() { close(release) }) // unblock the leaked goroutine at test end
e := &HostEnvironment{
Path: path,
CleanUp: func() {
<-release // simulate a delete syscall stuck indefinitely
},
StdOut: os.Stdout,
}
done := make(chan error, 1)
go func() { done <- e.Remove()(ctx) }()
select {
case err := <-done:
require.NoError(t, err)
case <-time.After(10 * time.Second):
t.Fatal("Remove() hung on a stuck CleanUp callback")
}
}
// TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval guards against a
// stalled os.RemoveAll on the misc/workspace paths (same AV/EDR wedge as
// #1023) wedging job completion after the CleanUp callback has already timed
// out or finished.
func TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval(t *testing.T) {
origTimeout := hostCleanupTimeout
hostCleanupTimeout = 100 * time.Millisecond
t.Cleanup(func() { hostCleanupTimeout = origTimeout })
release := make(chan struct{})
stubDone := make(chan struct{})
origRemoveAll := removeAll
removeAll = func(string) error {
defer close(stubDone)
<-release
return nil
}
// The stuck delete goroutine outlives the timed-out Remove and still reads
// removeAll; unblock it and wait before restoring to avoid a restore/read race.
t.Cleanup(func() {
close(release)
<-stubDone
removeAll = origRemoveAll
})
logger := logrus.New()
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
base := t.TempDir()
path := filepath.Join(base, "misc", "hostexecutor")
require.NoError(t, os.MkdirAll(path, 0o700))
e := &HostEnvironment{
Path: path,
StdOut: os.Stdout,
}
done := make(chan error, 1)
go func() { done <- e.Remove()(ctx) }()
select {
case err := <-done:
require.NoError(t, err)
case <-time.After(10 * time.Second):
t.Fatal("Remove() hung on a stuck path removal")
}
}
func TestBuildWindowsWorkspaceKillScript(t *testing.T) {
t.Run("single dir", func(t *testing.T) {
s := buildWindowsWorkspaceKillScript([]string{`C:\workspace\job1`})