mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-06-15 14:24:22 +02:00
fix(host): bound host-environment cleanup and reclaim leaked scratch dirs (#1024)
Fixes #1023.

## Problem

In Windows host mode, a single stalled delete syscall (AV/EDR filter driver, unresponsive mount, dying disk) wedged the job forever at `Cleaning up container`. `HostEnvironment.Remove()` bounds every teardown phase (`terminateRunningProcesses`, both `removePathWithRetry` calls) except the `CleanUp` callback — an unbounded `os.RemoveAll(miscpath)` assigned in `startHostEnvironment`. The runner then held its capacity slot indefinitely, the task was reaped as a zombie, and there were no diagnostics.

## Fix

- **Bound the cleanup (availability):** `Remove()` now runs `CleanUp` under `hostCleanupTimeout` (30s) via `runWithTimeout`; on timeout it logs a warning and continues job completion. The stuck goroutine is left to finish (a delete syscall can't be interrupted). Added debug logs around the phase.
- **Reclaim the leak (disk hygiene):** a timed-out cleanup can leave a scratch dir behind, so the existing idle stale-dir sweep is extended to also remove orphaned host-mode scratch dirs (16-hex names) under `Host.WorkdirParent`, leaving the shared `tool_cache` and operator data untouched. The `bind_workdir` gate is dropped from `shouldRunIdleCleanup` so host-mode runners run the sweep.

Reviewed-on: https://gitea.com/gitea/runner/pulls/1024
Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ import (
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.com/gitea/runner/act/common"
|
||||
|
||||
@@ -188,6 +189,118 @@ func TestHostEnvironmentRemoveCleansWorkdirWhenOwned(t *testing.T) {
|
||||
assert.ErrorIs(t, err, os.ErrNotExist)
|
||||
}
|
||||
|
||||
func TestRemoveAllWithContextDoesNotHangOnStuckDelete(t *testing.T) {
|
||||
release := make(chan struct{})
|
||||
stubDone := make(chan struct{})
|
||||
|
||||
orig := removeAll
|
||||
removeAll = func(string) error {
|
||||
defer close(stubDone)
|
||||
<-release
|
||||
return nil
|
||||
}
|
||||
// removeAllWithContext intentionally leaks the delete goroutine on timeout,
|
||||
// and that goroutine still references removeAll. Unblock it and wait for it
|
||||
// to return before restoring the var, so the restore can't race the read.
|
||||
t.Cleanup(func() {
|
||||
close(release)
|
||||
<-stubDone
|
||||
removeAll = orig
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
err := removeAllWithContext(ctx, t.TempDir())
|
||||
require.ErrorIs(t, err, context.DeadlineExceeded)
|
||||
}
|
||||
|
||||
// TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp guards against a stalled
|
||||
// CleanUp callback (e.g. an os.RemoveAll blocked by an AV/EDR filter driver or
|
||||
// an unresponsive mount) wedging the runner slot forever at "Cleaning up
|
||||
// container". Remove must time out the callback and complete job teardown.
|
||||
func TestHostEnvironmentRemoveDoesNotHangOnStuckCleanUp(t *testing.T) {
|
||||
// Keep the suite fast: shrink the per-phase teardown timeout for this test.
|
||||
orig := hostCleanupTimeout
|
||||
hostCleanupTimeout = 100 * time.Millisecond
|
||||
t.Cleanup(func() { hostCleanupTimeout = orig })
|
||||
|
||||
logger := logrus.New()
|
||||
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
|
||||
base := t.TempDir()
|
||||
path := filepath.Join(base, "misc", "hostexecutor")
|
||||
require.NoError(t, os.MkdirAll(path, 0o700))
|
||||
|
||||
release := make(chan struct{})
|
||||
t.Cleanup(func() { close(release) }) // unblock the leaked goroutine at test end
|
||||
|
||||
e := &HostEnvironment{
|
||||
Path: path,
|
||||
CleanUp: func() {
|
||||
<-release // simulate a delete syscall stuck indefinitely
|
||||
},
|
||||
StdOut: os.Stdout,
|
||||
}
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- e.Remove()(ctx) }()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
require.NoError(t, err)
|
||||
case <-time.After(10 * time.Second):
|
||||
t.Fatal("Remove() hung on a stuck CleanUp callback")
|
||||
}
|
||||
}
|
||||
|
||||
// TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval guards against a
|
||||
// stalled os.RemoveAll on the misc/workspace paths (same AV/EDR wedge as
|
||||
// #1023) wedging job completion after the CleanUp callback has already timed
|
||||
// out or finished.
|
||||
func TestHostEnvironmentRemoveDoesNotHangOnStuckPathRemoval(t *testing.T) {
|
||||
origTimeout := hostCleanupTimeout
|
||||
hostCleanupTimeout = 100 * time.Millisecond
|
||||
t.Cleanup(func() { hostCleanupTimeout = origTimeout })
|
||||
|
||||
release := make(chan struct{})
|
||||
stubDone := make(chan struct{})
|
||||
|
||||
origRemoveAll := removeAll
|
||||
removeAll = func(string) error {
|
||||
defer close(stubDone)
|
||||
<-release
|
||||
return nil
|
||||
}
|
||||
// The stuck delete goroutine outlives the timed-out Remove and still reads
|
||||
// removeAll; unblock it and wait before restoring to avoid a restore/read race.
|
||||
t.Cleanup(func() {
|
||||
close(release)
|
||||
<-stubDone
|
||||
removeAll = origRemoveAll
|
||||
})
|
||||
|
||||
logger := logrus.New()
|
||||
ctx := common.WithLogger(context.Background(), logrus.NewEntry(logger))
|
||||
base := t.TempDir()
|
||||
path := filepath.Join(base, "misc", "hostexecutor")
|
||||
require.NoError(t, os.MkdirAll(path, 0o700))
|
||||
|
||||
e := &HostEnvironment{
|
||||
Path: path,
|
||||
StdOut: os.Stdout,
|
||||
}
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- e.Remove()(ctx) }()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
require.NoError(t, err)
|
||||
case <-time.After(10 * time.Second):
|
||||
t.Fatal("Remove() hung on a stuck path removal")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildWindowsWorkspaceKillScript(t *testing.T) {
|
||||
t.Run("single dir", func(t *testing.T) {
|
||||
s := buildWindowsWorkspaceKillScript([]string{`C:\workspace\job1`})
|
||||
|
||||
Reference in New Issue
Block a user