perf: reduce runner-to-server connection load with adaptive reporting and polling

- Replace fixed 1s RunDaemon timer with event-driven select loop using
  separate log (3s) and state (5s) tickers for periodic flush
- Add batch-size threshold (default 100 rows) to flush logs immediately
  during bursty output like npm install
- Add max-latency timer (default 5s) to guarantee single log lines are
  delivered within a bounded time
- Trigger immediate flush on step transitions (start/stop) and job
  result for responsive frontend UX
- Skip ReportLog when no pending rows and ReportState when state is
  unchanged to eliminate no-op HTTP requests
- Replace fixed-rate polling with exponential backoff and jitter to
  prevent thundering herd on idle runners
- Tune HTTP client with MaxIdleConnsPerHost=10 and share a single
  http.Client between Ping and Runner service clients
- Add configurable options: log_report_interval, log_report_max_latency,
  log_report_batch_size, state_report_interval, fetch_interval_max

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Bo-Yi Wu
2026-04-10 22:41:38 +08:00
parent 90c1275f0e
commit ec07b8c00b
8 changed files with 304 additions and 83 deletions

View File

@@ -22,17 +22,23 @@ type Log struct {
// Runner represents the configuration for the runner.
type Runner struct {
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github
File string `yaml:"file"` // File specifies the file path for the runner.
Capacity int `yaml:"capacity"` // Capacity specifies the capacity of the runner.
Envs map[string]string `yaml:"envs"` // Envs stores environment variables for the runner.
EnvFile string `yaml:"env_file"` // EnvFile specifies the path to the file containing environment variables for the runner.
Timeout time.Duration `yaml:"timeout"` // Timeout specifies the duration for runner timeout.
ShutdownTimeout time.Duration `yaml:"shutdown_timeout"` // ShutdownTimeout specifies the duration to wait for running jobs to complete during a shutdown of the runner.
Insecure bool `yaml:"insecure"` // Insecure indicates whether the runner operates in an insecure mode.
FetchTimeout time.Duration `yaml:"fetch_timeout"` // FetchTimeout specifies the timeout duration for fetching resources.
FetchInterval time.Duration `yaml:"fetch_interval"` // FetchInterval specifies the interval duration for fetching resources.
FetchIntervalMax time.Duration `yaml:"fetch_interval_max"` // FetchIntervalMax specifies the maximum backoff interval when idle.
ReportInterval time.Duration `yaml:"report_interval"` // Deprecated: use LogReportInterval and StateReportInterval instead.
LogReportInterval time.Duration `yaml:"log_report_interval"` // LogReportInterval specifies the base interval for periodic log flush.
LogReportMaxLatency time.Duration `yaml:"log_report_max_latency"` // LogReportMaxLatency specifies the max time a log row can wait before being sent.
LogReportBatchSize int `yaml:"log_report_batch_size"` // LogReportBatchSize triggers immediate log flush when buffer reaches this size.
StateReportInterval time.Duration `yaml:"state_report_interval"` // StateReportInterval specifies the interval for state reporting.
Labels []string `yaml:"labels"` // Labels specify the labels of the runner. Labels are declared on each startup
GithubMirror string `yaml:"github_mirror"` // GithubMirror defines what mirrors should be used when using github
}
// Cache represents the configuration for caching.
@@ -137,6 +143,21 @@ func LoadDefault(file string) (*Config, error) {
if cfg.Runner.FetchInterval <= 0 {
cfg.Runner.FetchInterval = 2 * time.Second
}
if cfg.Runner.FetchIntervalMax <= 0 {
cfg.Runner.FetchIntervalMax = 60 * time.Second
}
if cfg.Runner.LogReportInterval <= 0 {
cfg.Runner.LogReportInterval = 3 * time.Second
}
if cfg.Runner.LogReportMaxLatency <= 0 {
cfg.Runner.LogReportMaxLatency = 5 * time.Second
}
if cfg.Runner.LogReportBatchSize <= 0 {
cfg.Runner.LogReportBatchSize = 100
}
if cfg.Runner.StateReportInterval <= 0 {
cfg.Runner.StateReportInterval = 5 * time.Second
}
// although `container.network_mode` will be deprecated, but we have to be compatible with it for now.
if cfg.Container.NetworkMode != "" && cfg.Container.Network == "" {