refactor: use single poller with semaphore-based capacity control
Previously, capacity=N spawned N independent polling goroutines, each making FetchTask RPCs to the Gitea server concurrently. This caused unnecessary connection load on the server proportional to the runner's capacity setting.

Replace the N-goroutine model with a single polling loop that uses a buffered channel as a semaphore to control concurrent task execution. The poller acquires a capacity slot before fetching; when at capacity, it blocks without issuing RPCs. Fetched tasks are dispatched to independent goroutines that release their slot on completion.

Also fix a pre-existing bug in Shutdown() where the timeout branch used a blocking receive on p.done instead of a non-blocking select, which prevented shutdownJobs() from ever being called on timeout.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
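The core pattern is a buffered channel used as a counting semaphore. The sketch below is a self-contained illustration of that shape, not act_runner code: the capacity constant, the fake task strings, and the sleep are placeholders standing in for the runner's config value, fetched tasks, and job execution.

	package main

	import (
		"context"
		"fmt"
		"sync"
		"time"
	)

	// A buffered channel of capacity N lets at most N tasks run at once;
	// the single polling loop blocks on the channel send (not on an RPC)
	// when all slots are taken.
	func main() {
		const capacity = 2
		sem := make(chan struct{}, capacity) // one slot per concurrent task
		var wg sync.WaitGroup

		ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
		defer cancel()

		for i := 0; ; i++ {
			select {
			case sem <- struct{}{}: // acquire a capacity slot before fetching
			case <-ctx.Done():
				wg.Wait()
				return
			}

			task := fmt.Sprintf("task-%d", i) // stand-in for a fetched task

			wg.Add(1)
			go func(t string) {
				defer wg.Done()
				defer func() { <-sem }() // release the slot when the task finishes
				fmt.Println("running", t)
				time.Sleep(100 * time.Millisecond) // stand-in for job execution
			}(task)
		}
	}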
@@ -37,16 +37,15 @@ type Poller struct {
 	done chan struct{}
 }
 
-// workerState holds per-goroutine polling state. Backoff counters are
-// per-worker so that with Capacity > 1, N workers each seeing one empty
-// response don't combine into a "consecutive N empty" reading on a shared
-// counter and trigger an unnecessarily long backoff.
+// workerState holds the single poller's backoff state. Consecutive empty or
+// error responses drive exponential backoff; a successful task fetch resets
+// both counters so the next poll fires immediately.
 type workerState struct {
 	consecutiveEmpty  int64
 	consecutiveErrors int64
-	// lastBackoff is the last interval reported to the PollBackoffSeconds gauge
-	// from this worker; used to suppress redundant no-op Set calls when the
-	// backoff plateaus (e.g. at FetchIntervalMax).
+	// lastBackoff is the last interval reported to the PollBackoffSeconds gauge;
+	// used to suppress redundant no-op Set calls when the backoff plateaus
+	// (e.g. at FetchIntervalMax).
 	lastBackoff time.Duration
 }
 
@@ -73,22 +72,57 @@ func New(cfg *config.Config, client client.Client, runner *run.Runner) *Poller {
 }
 
 func (p *Poller) Poll() {
+	sem := make(chan struct{}, p.cfg.Runner.Capacity)
 	wg := &sync.WaitGroup{}
-	for i := 0; i < p.cfg.Runner.Capacity; i++ {
-		wg.Add(1)
-		go p.poll(wg)
-	}
-	wg.Wait()
+	s := &workerState{}
 
-	// signal that we shutdown
-	close(p.done)
+	defer func() {
+		wg.Wait()
+		close(p.done)
+	}()
+
+	for {
+		select {
+		case sem <- struct{}{}:
+		case <-p.pollingCtx.Done():
+			return
+		}
+
+		task, ok := p.fetchTask(p.pollingCtx, s)
+		if !ok {
+			<-sem
+			if !p.waitBackoff(s) {
+				return
+			}
+			continue
+		}
+
+		s.resetBackoff()
+
+		wg.Add(1)
+		go func(t *runnerv1.Task) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			p.runTaskWithRecover(p.jobsCtx, t)
+		}(task)
+	}
 }
 
 func (p *Poller) PollOnce() {
-	p.pollOnce(&workerState{})
-
-	// signal that we're done
-	close(p.done)
+	defer close(p.done)
+	s := &workerState{}
+	for {
+		task, ok := p.fetchTask(p.pollingCtx, s)
+		if !ok {
+			if !p.waitBackoff(s) {
+				return
+			}
+			continue
+		}
+		s.resetBackoff()
+		p.runTaskWithRecover(p.jobsCtx, task)
+		return
+	}
 }
 
 func (p *Poller) Shutdown(ctx context.Context) error {
@@ -101,13 +135,13 @@ func (p *Poller) Shutdown(ctx context.Context) error {
 
 	// our timeout for shutting down ran out
 	case <-ctx.Done():
-		// when both the timeout fires and the graceful shutdown
-		// completed succsfully, this branch of the select may
-		// fire. Do a non-blocking check here against the graceful
-		// shutdown status to avoid sending an error if we don't need to.
-		_, ok := <-p.done
-		if !ok {
+		// Both the timeout and the graceful shutdown may fire
+		// simultaneously. Do a non-blocking check to avoid forcing
+		// a shutdown when graceful already completed.
+		select {
+		case <-p.done:
 			return nil
+		default:
 		}
 
 		// force a shutdown of all running jobs
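The Shutdown() fix above is small enough to demonstrate in isolation. A minimal sketch of why the old blocking receive hangs while the new non-blocking select falls through (standard library only; the done channel here is a stand-in for p.done):

	package main

	import "fmt"

	func main() {
		done := make(chan struct{}) // graceful shutdown has NOT completed

		// Old (buggy) shape: `_, ok := <-done` would block here forever,
		// so the forced-shutdown path below could never run on timeout.

		// New shape: non-blocking check with a default branch.
		select {
		case <-done:
			fmt.Println("graceful shutdown already completed")
		default:
			fmt.Println("still running; proceeding to force a shutdown")
		}
	}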
@@ -120,18 +154,27 @@ func (p *Poller) Shutdown(ctx context.Context) error {
 	}
 }
 
-func (p *Poller) poll(wg *sync.WaitGroup) {
-	defer wg.Done()
-	s := &workerState{}
-	for {
-		p.pollOnce(s)
-
-		select {
-		case <-p.pollingCtx.Done():
-			return
-		default:
-			continue
-		}
+func (s *workerState) resetBackoff() {
+	s.consecutiveEmpty = 0
+	s.consecutiveErrors = 0
+	s.lastBackoff = 0
+}
+
+// waitBackoff sleeps for the current backoff interval (with jitter).
+// Returns false if the polling context was cancelled during the wait.
+func (p *Poller) waitBackoff(s *workerState) bool {
+	base := p.calculateInterval(s)
+	if base != s.lastBackoff {
+		metrics.PollBackoffSeconds.Set(base.Seconds())
+		s.lastBackoff = base
+	}
+	timer := time.NewTimer(addJitter(base))
+	select {
+	case <-timer.C:
+		return true
+	case <-p.pollingCtx.Done():
+		timer.Stop()
+		return false
 	}
 }
 
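waitBackoff leans on calculateInterval and addJitter, which this commit leaves unchanged and the diff does not show. For orientation only, here is a sketch of what exponential backoff with jitter typically looks like; the doubling rule, the 2s/30s bounds, and the ~10% jitter factor are assumptions, not act_runner's actual values:

	package main

	import (
		"fmt"
		"math/rand"
		"time"
	)

	// interval doubles the base per consecutive empty poll, capped at max.
	// The shift guard also protects against overflow for large counters.
	func interval(consecutiveEmpty int64, min, max time.Duration) time.Duration {
		d := min << consecutiveEmpty
		if d > max || d <= 0 {
			d = max
		}
		return d
	}

	// addJitter adds up to ~10% random extra so many runners with the same
	// backoff schedule don't all poll the server at the same instant.
	func addJitter(d time.Duration) time.Duration {
		return d + time.Duration(rand.Int63n(int64(d)/10+1))
	}

	func main() {
		for n := int64(0); n < 6; n++ {
			base := interval(n, 2*time.Second, 30*time.Second)
			fmt.Printf("empty=%d base=%v jittered=%v\n", n, base, addJitter(base))
		}
	}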
@@ -167,34 +210,6 @@ func addJitter(d time.Duration) time.Duration {
 	return d + time.Duration(jitter)
 }
 
-func (p *Poller) pollOnce(s *workerState) {
-	for {
-		task, ok := p.fetchTask(p.pollingCtx, s)
-		if !ok {
-			base := p.calculateInterval(s)
-			if base != s.lastBackoff {
-				metrics.PollBackoffSeconds.Set(base.Seconds())
-				s.lastBackoff = base
-			}
-			timer := time.NewTimer(addJitter(base))
-			select {
-			case <-timer.C:
-			case <-p.pollingCtx.Done():
-				timer.Stop()
-				return
-			}
-			continue
-		}
-
-		// Got a task — reset backoff counters for fast subsequent polling.
-		s.consecutiveEmpty = 0
-		s.consecutiveErrors = 0
-
-		p.runTaskWithRecover(p.jobsCtx, task)
-		return
-	}
-}
-
 func (p *Poller) runTaskWithRecover(ctx context.Context, task *runnerv1.Task) {
 	defer func() {
 		if r := recover(); r != nil {
@@ -19,11 +19,10 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-// TestPoller_PerWorkerCounters verifies that each worker maintains its own
-// backoff counters. With a shared counter, N workers each seeing one empty
-// response would inflate the counter to N and trigger an unnecessarily long
-// backoff. With per-worker state, each worker only sees its own count.
-func TestPoller_PerWorkerCounters(t *testing.T) {
+// TestPoller_WorkerStateCounters verifies that workerState correctly tracks
+// consecutive empty responses independently per state instance, and that
+// fetchTask increments only the relevant counter.
+func TestPoller_WorkerStateCounters(t *testing.T) {
 	client := mocks.NewClient(t)
 	client.On("FetchTask", mock.Anything, mock.Anything).Return(
 		func(_ context.Context, _ *connect_go.Request[runnerv1.FetchTaskRequest]) (*connect_go.Response[runnerv1.FetchTaskResponse], error) {
@@ -77,8 +76,8 @@ func TestPoller_FetchErrorIncrementsErrorsOnly(t *testing.T) {
 	assert.Equal(t, int64(0), s.consecutiveEmpty)
 }
 
-// TestPoller_CalculateInterval verifies the per-worker exponential backoff
-// math is correctly driven by the worker's own counters.
+// TestPoller_CalculateInterval verifies the exponential backoff math is
+// correctly driven by the workerState counters.
 func TestPoller_CalculateInterval(t *testing.T) {
 	cfg, err := config.LoadDefault("")
 	require.NoError(t, err)
@@ -89,7 +89,7 @@ var (
 		Namespace: Namespace,
 		Subsystem: "poll",
 		Name:      "backoff_seconds",
-		Help:      "Last observed polling backoff interval. With Capacity > 1, reflects whichever worker wrote last.",
+		Help:      "Last observed polling backoff interval in seconds.",
 	})
 
 	JobsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
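For context, a minimal sketch of how such a gauge is defined and driven with the prometheus client library; the "runner" namespace is an assumption, and testutil is used only to read the value back:

	package main

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
		"github.com/prometheus/client_golang/prometheus/testutil"
	)

	// A single gauge holds the last observed backoff; Set overwrites it
	// in place, which is why waitBackoff skips Set when the value has not
	// changed since the last report.
	func main() {
		backoff := prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: "runner", // illustrative, not act_runner's actual namespace
			Subsystem: "poll",
			Name:      "backoff_seconds",
			Help:      "Last observed polling backoff interval in seconds.",
		})
		prometheus.MustRegister(backoff)

		backoff.Set(2.0)  // first backoff interval
		backoff.Set(30.0) // plateaued at the max; identical later Sets would be no-ops
		fmt.Println(testutil.ToFloat64(backoff)) // 30
	}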