mirror of
https://gitea.com/gitea/act_runner.git
synced 2026-05-07 15:53:24 +02:00
fix: Heartbeat ReportState for long-running silent jobs (#852)
Fixes #826.
This regressed in f2d54556 (#819, "perf: reduce runner-to-server connection load with adaptive reporting and polling"). That change added an early return in `ReportState` whenever there was no state change and no pending outputs, so jobs that produce no log output and no step transitions for many minutes (e.g. a Linux kernel build) stop sending heartbeats. The server eventually marks the task as orphaned and cancels it while the runner is still executing.
The fix records the time of the last successful `UpdateTask` call in an atomic `int64` (Unix nanoseconds) and keeps the no-op skip only while the previous report is younger than `stateReportInterval`. The periodic state ticker fires at exactly `stateReportInterval`, so silent jobs now heartbeat each tick; redundant sends from a `stateNotify` firing right after a tick are still suppressed, preserving the perf intent of #819.
Test added: `TestReporter_StateHeartbeat` asserts the skip path within the interval and the heartbeat path after the interval elapses.
---
This PR was written with the help of Claude Opus 4.7.
Reviewed-on: https://gitea.com/gitea/runner/pulls/852
Reviewed-by: Nicolas <bircni@icloud.com>
Reviewed-by: Lunny Xiao <xiaolunwen@gmail.com>
Reviewed-by: ChristopherHX <38043+christopherhx@noreply.gitea.com>
Co-authored-by: silverwind <me@silverwind.io>
Co-committed-by: silverwind <me@silverwind.io>
This commit is contained in:
@@ -10,6 +10,7 @@ import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"gitea.com/gitea/act_runner/internal/pkg/client"
|
||||
@@ -48,6 +49,10 @@ type Reporter struct {
|
||||
outputs sync.Map
|
||||
daemon chan struct{}
|
||||
|
||||
// Unix-nanos of the last successful UpdateTask. Atomic so the heartbeat
|
||||
// guard in ReportState reads it without contending stateMu.
|
||||
lastReportedAtNanos atomic.Int64
|
||||
|
||||
// Adaptive batching control
|
||||
logReportInterval time.Duration
|
||||
logReportMaxLatency time.Duration
|
||||
@@ -489,8 +494,12 @@ func (r *Reporter) ReportState(reportResult bool) error {
|
||||
|
||||
// Consume stateChanged atomically with the snapshot; restored on error
|
||||
// below so a concurrent Fire() during UpdateTask isn't silently lost.
|
||||
// Heartbeat at stateReportInterval even when nothing changed, so the server
|
||||
// doesn't time out long-running silent jobs as orphaned (#826).
|
||||
last := r.lastReportedAtNanos.Load()
|
||||
withinHeartbeatInterval := last != 0 && time.Since(time.Unix(0, last)) < r.stateReportInterval
|
||||
r.stateMu.Lock()
|
||||
if !reportResult && !r.stateChanged && len(outputs) == 0 {
|
||||
if !reportResult && !r.stateChanged && len(outputs) == 0 && withinHeartbeatInterval {
|
||||
r.stateMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
@@ -517,6 +526,7 @@ func (r *Reporter) ReportState(reportResult bool) error {
|
||||
return err
|
||||
}
|
||||
metrics.ReportStateTotal.WithLabelValues(metrics.LabelResultSuccess).Inc()
|
||||
r.lastReportedAtNanos.Store(time.Now().UnixNano())
|
||||
|
||||
for _, k := range resp.Msg.SentOutputs {
|
||||
r.outputs.Store(k, struct{}{})
|
||||
|
||||
Reference in New Issue
Block a user