fix(container): re-validate cached container id before reuse (#1003)

`containerReference.id` was cached from `Create()` and never re-validated, so a container torn down out-of-band (AutoRemove on an unexpected exit, daemon-side cleanup, sibling-job race in a parallel matrix) left a stale id behind. The next `Copy`/`Exec` then hit the daemon with that dead id and failed the otherwise-successful job with `Could not find the file /var/run/act/ in container <id>`.

`find()` now `ContainerInspect`s the cached id and clears it only on a definitive `NotFound`; transient errors trust the cache so cleanup pipelines don't abort on a daemon blip. Operations that need a live container (`copyContent`/`copyDir`/`CopyTarStream`/`exec`/`GetContainerArchive`) fail fast with a clear `container "<name>" does not exist` instead of the daemon's generic empty-id error.

---
This PR was written with the help of Claude Opus 4.7

---------

Co-authored-by: Nicolas <bircni@icloud.com>
Reviewed-on: https://gitea.com/gitea/runner/pulls/1003
Reviewed-by: Nicolas <bircni@icloud.com>
Co-authored-by: silverwind <2021+silverwind@noreply.gitea.com>
Co-committed-by: silverwind <2021+silverwind@noreply.gitea.com>
This commit is contained in:
silverwind
2026-05-29 22:33:44 +00:00
committed by Nicolas
parent c7c4bd600a
commit 1a41ca8ee7
2 changed files with 148 additions and 2 deletions

View File

@@ -26,6 +26,7 @@ import (
"dario.cat/mergo"
"github.com/Masterminds/semver"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/cli/cli/compose/loader"
"github.com/docker/cli/cli/connhelper"
"github.com/go-git/go-billy/v5/helper/polyfill"
@@ -152,6 +153,8 @@ func (cr *containerReference) Copy(destPath string, files ...*FileEntry) common.
func (cr *containerReference) CopyDir(destPath, srcPath string, useGitIgnore bool) common.Executor {
return common.NewPipelineExecutor(
common.NewInfoExecutor("docker cp src=%s dst=%s", srcPath, destPath),
cr.connect(),
cr.find(),
cr.copyDir(destPath, srcPath, useGitIgnore),
func(ctx context.Context) error {
// If this fails, then folders have wrong permissions on non root container
@@ -167,6 +170,16 @@ func (cr *containerReference) GetContainerArchive(ctx context.Context, srcPath s
if common.Dryrun(ctx) {
return nil, errors.New("DRYRUN is not supported in GetContainerArchive")
}
// Direct entry point (no pipeline) — revalidate cr.id ourselves.
if err := cr.connect()(ctx); err != nil {
return nil, err
}
if err := cr.find()(ctx); err != nil {
return nil, err
}
if cr.id == "" {
return nil, cr.missingContainerError("get archive %s", srcPath)
}
result, err := cr.cli.CopyFromContainer(ctx, cr.id, client.CopyFromContainerOptions{SourcePath: srcPath})
if err != nil {
return nil, err
@@ -314,10 +327,22 @@ func (cr *containerReference) Close() common.Executor {
}
}
// missingContainerError is the shared "container X does not exist" error
// used by ops that need a live cr.id.
func (cr *containerReference) missingContainerError(format string, args ...any) error {
return fmt.Errorf("container %q does not exist; cannot "+format, append([]any{cr.input.Name}, args...)...)
}
func (cr *containerReference) find() common.Executor {
return func(ctx context.Context) error {
if cr.id != "" {
return nil
// Validate cached id; clear only on definitive NotFound so a
// transient daemon error doesn't abort cleanup pipelines.
_, err := cr.cli.ContainerInspect(ctx, cr.id, client.ContainerInspectOptions{})
if !cerrdefs.IsNotFound(err) {
return nil
}
cr.id = ""
}
containers, err := cr.cli.ContainerList(ctx, client.ContainerListOptions{
All: true,
@@ -335,7 +360,6 @@ func (cr *containerReference) find() common.Executor {
}
}
cr.id = ""
return nil
}
}
@@ -592,6 +616,9 @@ func (cr *containerReference) extractFromImageEnv(env *map[string]string) common
func (cr *containerReference) exec(cmd []string, env map[string]string, user, workdir string) common.Executor {
return func(ctx context.Context) error {
if cr.id == "" {
return cr.missingContainerError("exec %v", cmd)
}
logger := common.Logger(ctx)
// Fix slashes when running on Windows
if runtime.GOOS == "windows" {
@@ -746,6 +773,9 @@ func (cr *containerReference) waitForCommand(ctx context.Context, isTerminal boo
}
func (cr *containerReference) CopyTarStream(ctx context.Context, destPath string, tarStream io.Reader) error {
if cr.id == "" {
return cr.missingContainerError("copy to %s", destPath)
}
// Mkdir
buf := &bytes.Buffer{}
tw := tar.NewWriter(buf)
@@ -779,6 +809,9 @@ func (cr *containerReference) CopyTarStream(ctx context.Context, destPath string
func (cr *containerReference) copyDir(dstPath, srcPath string, useGitIgnore bool) common.Executor {
return func(ctx context.Context) error {
if cr.id == "" {
return cr.missingContainerError("copy directory to %s", dstPath)
}
logger := common.Logger(ctx)
tarFile, err := os.CreateTemp("", "act")
if err != nil {
@@ -853,6 +886,9 @@ func (cr *containerReference) copyDir(dstPath, srcPath string, useGitIgnore bool
func (cr *containerReference) copyContent(dstPath string, files ...*FileEntry) common.Executor {
return func(ctx context.Context) error {
if cr.id == "" {
return cr.missingContainerError("copy to %s", dstPath)
}
logger := common.Logger(ctx)
var buf bytes.Buffer
tw := tar.NewWriter(&buf)

View File

@@ -19,6 +19,7 @@ import (
"gitea.com/gitea/runner/act/common"
cerrdefs "github.com/containerd/errdefs"
"github.com/moby/moby/api/types/container"
mobyclient "github.com/moby/moby/client"
"github.com/sirupsen/logrus/hooks/test"
@@ -98,6 +99,16 @@ func (m *mockDockerClient) CopyToContainer(ctx context.Context, id string, optio
return args.Get(0).(mobyclient.CopyToContainerResult), args.Error(1)
}
func (m *mockDockerClient) ContainerInspect(ctx context.Context, id string, opts mobyclient.ContainerInspectOptions) (mobyclient.ContainerInspectResult, error) {
args := m.Called(ctx, id, opts)
return args.Get(0).(mobyclient.ContainerInspectResult), args.Error(1)
}
func (m *mockDockerClient) ContainerList(ctx context.Context, opts mobyclient.ContainerListOptions) (mobyclient.ContainerListResult, error) {
args := m.Called(ctx, opts)
return args.Get(0).(mobyclient.ContainerListResult), args.Error(1)
}
type endlessReader struct {
io.Reader
}
@@ -298,6 +309,105 @@ func TestDockerCopyTarStreamErrorInMkdir(t *testing.T) {
client.AssertExpectations(t)
}
// find() must drop a stale cached id so later Copy/Exec don't hit the
// daemon with a torn-down container.
func TestFindRevalidatesStaleID(t *testing.T) {
ctx := context.Background()
notFound := cerrdefs.ErrNotFound.WithMessage("No such container")
boom := errors.New("daemon unreachable")
newCR := func(id string) (*containerReference, *mockDockerClient) {
client := &mockDockerClient{}
return &containerReference{id: id, cli: client, input: &NewContainerInput{Name: "job-1"}}, client
}
listOpts := mobyclient.ContainerListOptions{All: true}
inspectOpts := mobyclient.ContainerInspectOptions{}
t.Run("stale id cleared, name lookup empty", func(t *testing.T) {
cr, client := newCR("stale")
client.On("ContainerInspect", ctx, "stale", inspectOpts).Return(mobyclient.ContainerInspectResult{}, notFound)
client.On("ContainerList", ctx, listOpts).Return(mobyclient.ContainerListResult{}, nil)
require.NoError(t, cr.find()(ctx))
assert.Empty(t, cr.id)
client.AssertExpectations(t)
})
t.Run("stale id cleared, name lookup repopulates", func(t *testing.T) {
cr, client := newCR("stale")
client.On("ContainerInspect", ctx, "stale", inspectOpts).Return(mobyclient.ContainerInspectResult{}, notFound)
client.On("ContainerList", ctx, listOpts).Return(mobyclient.ContainerListResult{Items: []container.Summary{
{ID: "other", Names: []string{"/somebody-else"}},
{ID: "fresh", Names: []string{"/job-1"}},
}}, nil)
require.NoError(t, cr.find()(ctx))
assert.Equal(t, "fresh", cr.id)
client.AssertExpectations(t)
})
t.Run("live id kept", func(t *testing.T) {
cr, client := newCR("live")
client.On("ContainerInspect", ctx, "live", inspectOpts).Return(mobyclient.ContainerInspectResult{}, nil)
require.NoError(t, cr.find()(ctx))
assert.Equal(t, "live", cr.id)
client.AssertExpectations(t)
})
t.Run("transient inspect error trusts cache", func(t *testing.T) {
cr, client := newCR("live")
client.On("ContainerInspect", ctx, "live", inspectOpts).Return(mobyclient.ContainerInspectResult{}, boom)
require.NoError(t, cr.find()(ctx))
assert.Equal(t, "live", cr.id)
client.AssertExpectations(t)
})
t.Run("list error propagates", func(t *testing.T) {
cr, client := newCR("")
client.On("ContainerList", ctx, listOpts).Return(mobyclient.ContainerListResult{}, boom)
require.ErrorIs(t, cr.find()(ctx), boom)
client.AssertExpectations(t)
})
}
// Every daemon entry point fails fast with a clear, container-named
// error when no live cr.id is known.
func TestRejectsMissingContainer(t *testing.T) {
ctx := context.Background()
client := &mockDockerClient{}
client.On("ContainerList", ctx, mobyclient.ContainerListOptions{All: true}).Return(mobyclient.ContainerListResult{}, nil)
cr := &containerReference{cli: client, input: &NewContainerInput{Name: "job-1"}}
check := func(op string, err error) {
t.Helper()
require.Error(t, err, op)
assert.Contains(t, err.Error(), `container "job-1" does not exist`, op)
}
check("copyContent", cr.copyContent("/var/run/act", &FileEntry{Name: "x", Mode: 0o644})(ctx))
check("copyDir", cr.copyDir("/var/run/act", "/src", false)(ctx))
check("CopyTarStream", cr.CopyTarStream(ctx, "/var/run/act", &bytes.Buffer{}))
check("exec", cr.exec([]string{"echo"}, nil, "", "")(ctx))
_, err := cr.GetContainerArchive(ctx, "/var/run/act/x")
check("GetContainerArchive", err)
}
// End-to-end: a stale cr.id is cleared, repopulated from name lookup,
// and the Copy completes against the fresh id.
func TestPublicCopyPipelineHandlesStaleID(t *testing.T) {
ctx := context.Background()
client := &mockDockerClient{}
client.On("ContainerInspect", ctx, "stale", mobyclient.ContainerInspectOptions{}).
Return(mobyclient.ContainerInspectResult{}, cerrdefs.ErrNotFound.WithMessage("gone"))
client.On("ContainerList", ctx, mobyclient.ContainerListOptions{All: true}).
Return(mobyclient.ContainerListResult{Items: []container.Summary{
{ID: "fresh", Names: []string{"/job-1"}},
}}, nil)
client.On("CopyToContainer", ctx, "fresh", mock.MatchedBy(func(opts mobyclient.CopyToContainerOptions) bool {
return opts.DestinationPath == "/var/run/act"
})).Return(mobyclient.CopyToContainerResult{}, nil)
cr := &containerReference{id: "stale", cli: client, input: &NewContainerInput{Name: "job-1"}}
require.NoError(t, cr.Copy("/var/run/act", &FileEntry{Name: "x", Mode: 0o644})(ctx))
assert.Equal(t, "fresh", cr.id)
client.AssertExpectations(t)
}
// TestDockerCopyToSymlinkPath is a regression test for gitea/runner#981. Most base images
// symlink /var/run to /run, so copying into /var/run/act traverses that symlink. The broken
// docker 29.5.1 daemon fails the extraction with "mkdirat var/run: file exists" (fixed in