refactor: use single poller with semaphore-based capacity control

Previously, capacity=N spawned N independent polling goroutines, each
making FetchTask RPCs to the Gitea server concurrently. This caused
unnecessary connection load on the server proportional to the runner's
capacity setting.

Replace the N-goroutine model with a single polling loop that uses a
buffered channel as a semaphore to control concurrent task execution.
The poller acquires a capacity slot before fetching; when at capacity,
it blocks without issuing RPCs. Fetched tasks are dispatched to
independent goroutines that release their slot on completion.
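
In isolation, the pattern looks like the sketch below. This is a minimal,
hypothetical rendering (fetch and run stand in for FetchTask and task
execution; the real Poll() is in the diff): the buffered channel's capacity
is the concurrency cap, and the fetch call is only ever made while holding
a slot.

```go
package poll

import (
	"context"
	"sync"
)

// pollLoop sketches the semaphore pattern. fetch and run are placeholder
// callbacks, not the runner's actual API; only the channel discipline
// is the point.
func pollLoop(ctx context.Context, capacity int, fetch func(context.Context) (string, bool), run func(string)) {
	sem := make(chan struct{}, capacity) // each buffered element is one capacity slot
	var wg sync.WaitGroup
	defer wg.Wait() // drain in-flight tasks before returning

	for {
		select {
		case sem <- struct{}{}: // acquire a slot; blocks while at capacity
		case <-ctx.Done():
			return
		}
		task, ok := fetch(ctx) // at most one fetch in flight at a time
		if !ok {
			<-sem // nothing to run: release the slot before retrying
			continue
		}
		wg.Add(1)
		go func(t string) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot when the task finishes
			run(t)
		}(task)
	}
}
```

Note the release on the empty-fetch path: without the `<-sem` there, slots
would leak on every empty poll and the loop would eventually block forever.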

Also fix a pre-existing bug in Shutdown() where the timeout branch
used a blocking receive on p.done instead of a non-blocking select,
which prevented shutdownJobs() from ever being called on timeout.
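
The difference between the two receives, reduced to a self-contained,
illustrative contrast (these helpers are not the poller's code; the actual
fix is in the diff below):

```go
package main

import "fmt"

// blockingCheck mirrors the buggy branch: the bare receive parks the
// goroutine until done is closed, so any forced-shutdown code after it
// is unreachable while jobs are still running.
func blockingCheck(done chan struct{}) bool {
	_, ok := <-done // blocks forever if done is never closed
	return !ok      // ok is false once the channel has been closed
}

// nonBlockingCheck mirrors the fix: peek at done and fall through to
// default, letting the caller proceed to force a shutdown.
func nonBlockingCheck(done chan struct{}) bool {
	select {
	case <-done:
		return true // graceful shutdown already completed
	default:
		return false // still running; caller should force shutdown
	}
}

func main() {
	closed := make(chan struct{})
	close(closed)
	fmt.Println(blockingCheck(closed))                 // true, but only because done was closed
	fmt.Println(nonBlockingCheck(make(chan struct{}))) // false, returns immediately
}
```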

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Bo-Yi Wu
2026-04-16 14:56:13 +08:00
parent 40dcee0991
commit 272a446326
3 changed files with 85 additions and 71 deletions

View File

@@ -37,16 +37,15 @@ type Poller struct {
 	done chan struct{}
 }
 
-// workerState holds per-goroutine polling state. Backoff counters are
-// per-worker so that with Capacity > 1, N workers each seeing one empty
-// response don't combine into a "consecutive N empty" reading on a shared
-// counter and trigger an unnecessarily long backoff.
+// workerState holds the single poller's backoff state. Consecutive empty or
+// error responses drive exponential backoff; a successful task fetch resets
+// both counters so the next poll fires immediately.
 type workerState struct {
 	consecutiveEmpty  int64
 	consecutiveErrors int64
-	// lastBackoff is the last interval reported to the PollBackoffSeconds gauge
-	// from this worker; used to suppress redundant no-op Set calls when the
-	// backoff plateaus (e.g. at FetchIntervalMax).
+	// lastBackoff is the last interval reported to the PollBackoffSeconds gauge;
+	// used to suppress redundant no-op Set calls when the backoff plateaus
+	// (e.g. at FetchIntervalMax).
 	lastBackoff time.Duration
 }
@@ -73,22 +72,57 @@ func New(cfg *config.Config, client client.Client, runner *run.Runner) *Poller {
 }
 
 func (p *Poller) Poll() {
+	sem := make(chan struct{}, p.cfg.Runner.Capacity)
 	wg := &sync.WaitGroup{}
-	for i := 0; i < p.cfg.Runner.Capacity; i++ {
-		wg.Add(1)
-		go p.poll(wg)
-	}
-	wg.Wait()
+	s := &workerState{}
 
-	// signal that we shutdown
-	close(p.done)
+	defer func() {
+		wg.Wait()
+		close(p.done)
+	}()
+
+	for {
+		select {
+		case sem <- struct{}{}:
+		case <-p.pollingCtx.Done():
+			return
+		}
+		task, ok := p.fetchTask(p.pollingCtx, s)
+		if !ok {
+			<-sem
+			if !p.waitBackoff(s) {
+				return
+			}
+			continue
+		}
+		s.resetBackoff()
+		wg.Add(1)
+		go func(t *runnerv1.Task) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			p.runTaskWithRecover(p.jobsCtx, t)
+		}(task)
+	}
 }
 
 func (p *Poller) PollOnce() {
-	p.pollOnce(&workerState{})
+	defer close(p.done)
 
-	// signal that we're done
-	close(p.done)
+	s := &workerState{}
+	for {
+		task, ok := p.fetchTask(p.pollingCtx, s)
+		if !ok {
+			if !p.waitBackoff(s) {
+				return
+			}
+			continue
+		}
+		s.resetBackoff()
+		p.runTaskWithRecover(p.jobsCtx, task)
+		return
+	}
 }
 
 func (p *Poller) Shutdown(ctx context.Context) error {
@@ -101,13 +135,13 @@ func (p *Poller) Shutdown(ctx context.Context) error {
 	// our timeout for shutting down ran out
 	case <-ctx.Done():
-		// when both the timeout fires and the graceful shutdown
-		// completed succsfully, this branch of the select may
-		// fire. Do a non-blocking check here against the graceful
-		// shutdown status to avoid sending an error if we don't need to.
-		_, ok := <-p.done
-		if !ok {
+		// Both the timeout and the graceful shutdown may fire
+		// simultaneously. Do a non-blocking check to avoid forcing
+		// a shutdown when graceful already completed.
+		select {
+		case <-p.done:
 			return nil
+		default:
 		}
 
 		// force a shutdown of all running jobs
@@ -120,18 +154,27 @@ func (p *Poller) Shutdown(ctx context.Context) error {
 	}
 }
 
-func (p *Poller) poll(wg *sync.WaitGroup) {
-	defer wg.Done()
-	s := &workerState{}
-	for {
-		p.pollOnce(s)
-		select {
-		case <-p.pollingCtx.Done():
-			return
-		default:
-			continue
-		}
-	}
-}
+func (s *workerState) resetBackoff() {
+	s.consecutiveEmpty = 0
+	s.consecutiveErrors = 0
+	s.lastBackoff = 0
+}
+
+// waitBackoff sleeps for the current backoff interval (with jitter).
+// Returns false if the polling context was cancelled during the wait.
+func (p *Poller) waitBackoff(s *workerState) bool {
+	base := p.calculateInterval(s)
+	if base != s.lastBackoff {
+		metrics.PollBackoffSeconds.Set(base.Seconds())
+		s.lastBackoff = base
+	}
+	timer := time.NewTimer(addJitter(base))
+	select {
+	case <-timer.C:
+		return true
+	case <-p.pollingCtx.Done():
+		timer.Stop()
+		return false
+	}
+}
@@ -167,34 +210,6 @@ func addJitter(d time.Duration) time.Duration {
 	return d + time.Duration(jitter)
 }
 
-func (p *Poller) pollOnce(s *workerState) {
-	for {
-		task, ok := p.fetchTask(p.pollingCtx, s)
-		if !ok {
-			base := p.calculateInterval(s)
-			if base != s.lastBackoff {
-				metrics.PollBackoffSeconds.Set(base.Seconds())
-				s.lastBackoff = base
-			}
-			timer := time.NewTimer(addJitter(base))
-			select {
-			case <-timer.C:
-			case <-p.pollingCtx.Done():
-				timer.Stop()
-				return
-			}
-			continue
-		}
-
-		// Got a task — reset backoff counters for fast subsequent polling.
-		s.consecutiveEmpty = 0
-		s.consecutiveErrors = 0
-		p.runTaskWithRecover(p.jobsCtx, task)
-		return
-	}
-}
-
 func (p *Poller) runTaskWithRecover(ctx context.Context, task *runnerv1.Task) {
 	defer func() {
 		if r := recover(); r != nil {

View File

@@ -19,11 +19,10 @@ import (
"gitea.com/gitea/act_runner/internal/pkg/config" "gitea.com/gitea/act_runner/internal/pkg/config"
) )
// TestPoller_PerWorkerCounters verifies that each worker maintains its own // TestPoller_WorkerStateCounters verifies that workerState correctly tracks
// backoff counters. With a shared counter, N workers each seeing one empty // consecutive empty responses independently per state instance, and that
// response would inflate the counter to N and trigger an unnecessarily long // fetchTask increments only the relevant counter.
// backoff. With per-worker state, each worker only sees its own count. func TestPoller_WorkerStateCounters(t *testing.T) {
func TestPoller_PerWorkerCounters(t *testing.T) {
client := mocks.NewClient(t) client := mocks.NewClient(t)
client.On("FetchTask", mock.Anything, mock.Anything).Return( client.On("FetchTask", mock.Anything, mock.Anything).Return(
func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) { func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
@@ -77,8 +76,8 @@ func TestPoller_FetchErrorIncrementsErrorsOnly(t *testing.T) {
 	assert.Equal(t, int64(0), s.consecutiveEmpty)
 }
 
-// TestPoller_CalculateInterval verifies the per-worker exponential backoff
-// math is correctly driven by the worker's own counters.
+// TestPoller_CalculateInterval verifies the exponential backoff math is
+// correctly driven by the workerState counters.
 func TestPoller_CalculateInterval(t *testing.T) {
 	cfg, err := config.LoadDefault("")
 	require.NoError(t, err)

View File

@@ -89,7 +89,7 @@ var (
 		Namespace: Namespace,
 		Subsystem: "poll",
 		Name:      "backoff_seconds",
-		Help:      "Last observed polling backoff interval. With Capacity > 1, reflects whichever worker wrote last.",
+		Help:      "Last observed polling backoff interval in seconds.",
 	})
 
 	JobsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{