fix: serialize action-cache reads to prevent worktree race (#938)

`NewGitCloneExecutor` holds a per-directory mutex while it `git checkout --force`s a remote action into the shared `<ActionCacheDir>/<UsesHash>`, but four read sites ran unlocked:

- `maybeCopyToActionDir`'s tar walk via `JobContainer.CopyDir`
- `prepareActionExecutor`'s `readAction` parse of `action.yml`
- `newReusableWorkflowExecutor`'s `model.NewWorkflowPlanner` after `cloneRemoteReusableWorkflow` released its lock
- `execAsDocker` when `ActionCache == nil`: `docker build` walks `contextDir` for the daemon-side build context

When two matrix jobs share a `uses:`, a read interleaved with a peer's checkout produces partial state — observed as `Cannot find module .../dist/index.js` and `setup-uv` failing on a half-written `action.yml`.

Exports `acquireCloneLock` as `AcquireCloneLock` and takes it at all four sites. `container.ImageExistsLocally` / `NewDockerBuildExecutor` and `model.NewWorkflowPlanner` are indirected through package-level vars so the docker-action build path and the reusable-workflow read site are testable without a real daemon, mirroring `ContainerNewContainer`. Three regression tests cover the higher-risk sites (`maybeCopyToActionDir`, `execAsDocker`, `newReusableWorkflowExecutor`); each fails if its `AcquireCloneLock` is removed.

Subsumed by https://gitea.com/gitea/runner/pulls/814 once that lands. Related: https://gitea.com/gitea/runner/pulls/930

---
This PR was written with the help of Claude Opus 4.7

---------

Co-authored-by: Nicolas <bircni@icloud.com>
Reviewed-on: https://gitea.com/gitea/runner/pulls/938
Reviewed-by: Nicolas <bircni@icloud.com>
Reviewed-by: Zettat123 <39446+zettat123@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Co-committed-by: silverwind <me@silverwind.io>
This commit is contained in:
silverwind
2026-05-07 19:57:04 +00:00
committed by silverwind
parent 75643645f0
commit cce8543d06
7 changed files with 247 additions and 14 deletions

View File

@@ -9,8 +9,13 @@ import (
"io"
"io/fs"
"strings"
"sync"
"testing"
"time"
"gitea.com/gitea/runner/act/common"
"gitea.com/gitea/runner/act/common/git"
"gitea.com/gitea/runner/act/container"
"gitea.com/gitea/runner/act/model"
"github.com/stretchr/testify/assert"
@@ -252,3 +257,153 @@ func TestActionRunner(t *testing.T) {
})
}
}
func TestMaybeCopyToActionDirHoldsCloneLock(t *testing.T) {
ctx := context.Background()
actionDir := t.TempDir()
releaseCopy := make(chan struct{})
release := sync.OnceFunc(func() { close(releaseCopy) })
defer release()
copyEntered := make(chan struct{})
cm := &containerMock{}
cm.On("CopyDir", "/var/run/act/actions/", actionDir+"/", false).Return(func(ctx context.Context) error {
close(copyEntered)
<-releaseCopy
return nil
})
step := &stepActionRemote{
Step: &model.Step{Uses: "remote/action@v1"},
RunContext: &RunContext{
Config: &Config{},
JobContainer: cm,
},
}
copyDone := make(chan error, 1)
go func() {
copyDone <- maybeCopyToActionDir(ctx, step, actionDir, "", "/var/run/act/actions/")
}()
select {
case <-copyEntered:
case err := <-copyDone:
t.Fatalf("maybeCopyToActionDir returned before CopyDir was entered: %v", err)
case <-time.After(time.Second):
t.Fatal("CopyDir was not entered within 1 second")
}
peerAcquired := make(chan struct{})
go func() {
unlock := git.AcquireCloneLock(actionDir)
close(peerAcquired)
unlock()
}()
select {
case <-peerAcquired:
t.Fatal("peer AcquireCloneLock returned while CopyDir was running")
case <-time.After(50 * time.Millisecond):
}
release()
select {
case err := <-copyDone:
if err != nil {
t.Fatalf("maybeCopyToActionDir returned error: %v", err)
}
case <-time.After(time.Second):
t.Fatal("maybeCopyToActionDir did not return after CopyDir was unblocked")
}
select {
case <-peerAcquired:
case <-time.After(time.Second):
t.Fatal("peer AcquireCloneLock did not proceed after lock released")
}
cm.AssertExpectations(t)
}
func TestExecAsDockerHoldsCloneLockForRemoteUncached(t *testing.T) {
actionDir := t.TempDir()
unlockOnce := sync.OnceFunc(git.AcquireCloneLock(actionDir))
defer unlockOnce()
innerEntered := make(chan struct{})
releaseInner := make(chan struct{})
releaseOnce := sync.OnceFunc(func() { close(releaseInner) })
defer releaseOnce()
origImageExists := ContainerImageExistsLocally
ContainerImageExistsLocally = func(_ context.Context, _, _ string) (bool, error) {
return false, nil
}
defer func() { ContainerImageExistsLocally = origImageExists }()
origBuildExec := ContainerNewDockerBuildExecutor
ContainerNewDockerBuildExecutor = func(_ container.NewDockerBuildExecutorInput) common.Executor {
return func(_ context.Context) error {
close(innerEntered)
<-releaseInner
return nil
}
}
defer func() { ContainerNewDockerBuildExecutor = origBuildExec }()
step := &stepActionRemote{
Step: &model.Step{ID: "1", Uses: "remote/action@v1", With: map[string]string{}},
RunContext: &RunContext{
Config: &Config{},
Run: &model.Run{
JobID: "1",
Workflow: &model.Workflow{
Name: "wf",
Jobs: map[string]*model.Job{"1": {}},
},
},
JobContainer: &containerMock{},
},
action: &model.Action{Runs: model.ActionRuns{Using: "docker", Image: "Dockerfile"}},
env: map[string]string{},
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
done := make(chan error, 1)
go func() { done <- execAsDocker(ctx, step, "test-action", actionDir, actionDir, false) }()
select {
case <-innerEntered:
t.Fatal("inner build executor ran before clone lock was released")
case err := <-done:
t.Fatalf("execAsDocker returned before inner was entered: %v", err)
case <-time.After(50 * time.Millisecond):
}
unlockOnce()
select {
case <-innerEntered:
case err := <-done:
t.Fatalf("execAsDocker returned without entering inner: %v", err)
case <-time.After(time.Second):
t.Fatal("inner build executor not entered after lock released")
}
cancel()
releaseOnce()
select {
case <-done:
case <-time.After(time.Second):
t.Fatal("execAsDocker did not return after inner was released and ctx was canceled")
}
}