fix: serialize action-cache reads to prevent worktree race (#938)

`NewGitCloneExecutor` holds a per-directory mutex while it `git checkout --force`s a remote action into the shared `<ActionCacheDir>/<UsesHash>`, but four read sites ran unlocked:

- `maybeCopyToActionDir`'s tar walk via `JobContainer.CopyDir`
- `prepareActionExecutor`'s `readAction` parse of `action.yml`
- `newReusableWorkflowExecutor`'s `model.NewWorkflowPlanner` after `cloneRemoteReusableWorkflow` released its lock
- `execAsDocker` when `ActionCache == nil`: `docker build` walks `contextDir` for the daemon-side build context

When two matrix jobs share a `uses:`, a read interleaved with a peer's checkout produces partial state — observed as `Cannot find module .../dist/index.js` and `setup-uv` failing on a half-written `action.yml`.

Exports `acquireCloneLock` as `AcquireCloneLock` and takes it at all four sites. `container.ImageExistsLocally` / `NewDockerBuildExecutor` and `model.NewWorkflowPlanner` are indirected through package-level vars so the docker-action build path and the reusable-workflow read site are testable without a real daemon, mirroring `ContainerNewContainer`. Three regression tests cover the higher-risk sites (`maybeCopyToActionDir`, `execAsDocker`, `newReusableWorkflowExecutor`); each fails if its `AcquireCloneLock` is removed.

Subsumed by https://gitea.com/gitea/runner/pulls/814 once that lands. Related: https://gitea.com/gitea/runner/pulls/930

---
This PR was written with the help of Claude Opus 4.7

---------

Co-authored-by: Nicolas <bircni@icloud.com>
Reviewed-on: https://gitea.com/gitea/runner/pulls/938
Reviewed-by: Nicolas <bircni@icloud.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Co-committed-by: silverwind <me@silverwind.io>
This commit is contained in:
silverwind
2026-05-07 19:57:04 +00:00
committed by silverwind
parent 75643645f0
commit cce8543d06
7 changed files with 247 additions and 14 deletions

View File

@@ -5,11 +5,15 @@ package runner
import (
"context"
"errors"
"os"
"os/exec"
"path/filepath"
"sync"
"testing"
"time"
"gitea.com/gitea/runner/act/common/git"
"gitea.com/gitea/runner/act/model"
"github.com/stretchr/testify/require"
@@ -71,6 +75,54 @@ func TestReusableWorkflowCachedBranchRefRefreshes(t *testing.T) {
require.Equal(t, tmpl("v2"), string(got), "cached workflow file must reflect the updated branch tip")
}
func TestNewReusableWorkflowExecutorHoldsCloneLock(t *testing.T) {
workflowDir := t.TempDir()
unlockOnce := sync.OnceFunc(git.AcquireCloneLock(workflowDir))
defer unlockOnce()
plannerCalled := make(chan struct{})
origPlanner := modelNewWorkflowPlanner
modelNewWorkflowPlanner = func(string, bool) (model.WorkflowPlanner, error) {
close(plannerCalled)
return nil, errors.New("stop")
}
defer func() { modelNewWorkflowPlanner = origPlanner }()
rc := &RunContext{
Config: &Config{},
Run: &model.Run{Workflow: &model.Workflow{Jobs: map[string]*model.Job{}}},
}
exec := newReusableWorkflowExecutor(rc, workflowDir, "reusable.yml")
done := make(chan error, 1)
go func() { done <- exec(context.Background()) }()
select {
case <-plannerCalled:
t.Fatal("planner ran while clone lock was held")
case err := <-done:
t.Fatalf("executor returned before planner was reached: %v", err)
case <-time.After(50 * time.Millisecond):
}
unlockOnce()
select {
case <-plannerCalled:
case <-time.After(time.Second):
t.Fatal("planner not called after lock was released")
}
select {
case err := <-done:
require.Error(t, err)
case <-time.After(time.Second):
t.Fatal("executor did not return after planner ran")
}
}
func gitMust(t *testing.T, dir string, args ...string) {
t.Helper()
cmd := exec.Command("git", args...)