fix: prevent RunDaemon from sending completed state before Close sends final logs

Split ReportState into a public method that skips when closed (used by
RunDaemon) and a private reportState that always sends (used by Close).
This prevents the server from deleting ephemeral runners before final
logs are uploaded. Add test reproducing the exact interleaving from #793.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
silverwind
2026-02-19 04:05:54 +01:00
parent 5e139031c6
commit 3e139b7f09
2 changed files with 91 additions and 1 deletion

View File

@@ -261,7 +261,7 @@ func (r *Reporter) Close(lastWords string) error {
if err := r.ReportLog(true); err != nil {
return err
}
return r.ReportState()
return r.reportState()
}, retry.Context(r.ctx))
}
@@ -300,7 +300,18 @@ func (r *Reporter) ReportLog(noMore bool) error {
return nil
}
// ReportState sends the current task state via reportState, but only while
// the reporter has not been closed. Once closed is set it returns nil without
// sending anything.
//
// The closed flag is read under the stateMu read lock, and the lock is
// released before reportState is called.
// NOTE(review): because the lock is dropped before the send, the check and
// the send are not atomic — confirm whether a Close landing in between can
// still matter.
func (r *Reporter) ReportState() error {
	r.stateMu.RLock()
	isClosed := r.closed
	r.stateMu.RUnlock()

	if !isClosed {
		return r.reportState()
	}
	return nil
}
func (r *Reporter) reportState() error {
r.clientM.Lock()
defer r.clientM.Unlock()

View File

@@ -5,6 +5,7 @@ package report
import (
"context"
"fmt"
"strings"
"sync"
"testing"
@@ -17,6 +18,7 @@ import (
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/structpb"
"google.golang.org/protobuf/types/known/timestamppb"
"gitea.com/gitea/act_runner/internal/pkg/client/mocks"
)
@@ -198,6 +200,83 @@ func TestReporter_Fire(t *testing.T) {
})
}
// TestReporter_EphemeralRunnerDeletion reproduces the exact scenario from
// https://gitea.com/gitea/act_runner/issues/793:
//
// 1. RunDaemon calls ReportLog(false) — runner is still alive
// 2. Close() updates state to Result=FAILURE (between RunDaemon's ReportLog and ReportState)
// 3. RunDaemon's ReportState() would clone the completed state and send it,
// but the fix makes ReportState return early when closed, preventing this
// 4. Close's ReportLog(true) succeeds because the runner was not deleted
func TestReporter_EphemeralRunnerDeletion(t *testing.T) {
runnerDeleted := false
client := mocks.NewClient(t)
client.On("UpdateLog", mock.Anything, mock.Anything).Return(
func(_ context.Context, req *connect_go.Request[runnerv1.UpdateLogRequest]) (*connect_go.Response[runnerv1.UpdateLogResponse], error) {
if runnerDeleted {
return nil, fmt.Errorf("runner has been deleted")
}
return connect_go.NewResponse(&runnerv1.UpdateLogResponse{
AckIndex: req.Msg.Index + int64(len(req.Msg.Rows)),
}), nil
},
)
client.On("UpdateTask", mock.Anything, mock.Anything).Maybe().Return(
func(_ context.Context, req *connect_go.Request[runnerv1.UpdateTaskRequest]) (*connect_go.Response[runnerv1.UpdateTaskResponse], error) {
// Server deletes ephemeral runner when it receives a completed state
if req.Msg.State != nil && req.Msg.State.Result != runnerv1.Result_RESULT_UNSPECIFIED {
runnerDeleted = true
}
return connect_go.NewResponse(&runnerv1.UpdateTaskResponse{}), nil
},
)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
taskCtx, err := structpb.NewStruct(map[string]interface{}{})
require.NoError(t, err)
reporter := NewReporter(ctx, cancel, client, &runnerv1.Task{Context: taskCtx})
reporter.ResetSteps(1)
// Fire a log entry to create pending data
assert.NoError(t, reporter.Fire(&log.Entry{
Message: "build output",
Data: log.Fields{"stage": "Main", "stepNumber": 0, "raw_output": true},
}))
// Step 1: RunDaemon calls ReportLog(false) — runner is still alive
assert.NoError(t, reporter.ReportLog(false))
// Step 2: Close() updates state — sets Result=FAILURE and marks steps cancelled.
// In the real race, this happens while RunDaemon is between ReportLog and ReportState.
reporter.stateMu.Lock()
reporter.closed = true
for _, v := range reporter.state.Steps {
if v.Result == runnerv1.Result_RESULT_UNSPECIFIED {
v.Result = runnerv1.Result_RESULT_CANCELLED
}
}
reporter.state.Result = runnerv1.Result_RESULT_FAILURE
reporter.logRows = append(reporter.logRows, &runnerv1.LogRow{
Time: timestamppb.Now(),
Content: "Early termination",
})
reporter.state.StoppedAt = timestamppb.Now()
reporter.stateMu.Unlock()
// Step 3: RunDaemon's ReportState() — with the fix, this returns early
// because closed=true, preventing the server from deleting the runner.
assert.NoError(t, reporter.ReportState())
assert.False(t, runnerDeleted, "runner must not be deleted by RunDaemon's ReportState")
// Step 4: Close's final log upload succeeds because the runner is still alive.
// Flush pending rows first, then send the noMore signal (matching Close's retry behavior).
assert.NoError(t, reporter.ReportLog(false))
err = reporter.ReportLog(true)
assert.NoError(t, err, "final log upload must not fail: runner should not be deleted before Close finishes sending logs")
}
func TestReporter_RunDaemonClose_Race(t *testing.T) {
client := mocks.NewClient(t)
client.On("UpdateLog", mock.Anything, mock.Anything).Return(