From 73b14405d72884a6e69b36985213a0da3422dcce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20K=C3=A4ppler?= <mkaeppler@gitlab.com> Date: Tue, 22 Oct 2019 10:44:51 +0000 Subject: [PATCH] More fine-grained metrics for sidekiq workers We currently only measure the global (node-level) business in terms of jobs being processed. We would instead like to be able to know for each sidekiq process how saturated it is in terms of actual concurrency vs requested/max concurrency. This patch does the following: - add a worker/process dimension to all existing metrics - introduce a new metric to observe concurrency per process by relating "actual job count per process" to "max job count per process" we can then obtain some sort of saturation metric. --- ee/lib/gitlab/sidekiq_cluster.rb | 11 +++-- ee/lib/gitlab/sidekiq_cluster/cli.rb | 3 +- .../lib/gitlab/sidekiq_cluster/cli_spec.rb | 11 +++-- ee/spec/lib/gitlab/sidekiq_cluster_spec.rb | 40 +++++++++------- lib/gitlab/sidekiq_middleware/metrics.rb | 5 +- lib/prometheus/pid_provider.rb | 10 +++- .../gitlab/sidekiq_middleware/metrics_spec.rb | 46 ++++++++++++------- spec/lib/prometheus/pid_provider_spec.rb | 12 ++++- 8 files changed, 93 insertions(+), 45 deletions(-) diff --git a/ee/lib/gitlab/sidekiq_cluster.rb b/ee/lib/gitlab/sidekiq_cluster.rb index 4ffa9b4d5b3e..7b2d8ce7a554 100644 --- a/ee/lib/gitlab/sidekiq_cluster.rb +++ b/ee/lib/gitlab/sidekiq_cluster.rb @@ -64,14 +64,16 @@ def self.parse_queues(array) # directory - The directory of the Rails application. # # Returns an Array containing the PIDs of the started processes. - def self.start(queues, env, directory = Dir.pwd, max_concurrency = 50, dryrun: false) - queues.map { |pair| start_sidekiq(pair, env, directory, max_concurrency, dryrun: dryrun) } + def self.start(queues, env: :development, directory: Dir.pwd, max_concurrency: 50, dryrun: false) + queues.map.with_index do |pair, index| + start_sidekiq(pair, env: env, directory: directory, max_concurrency: max_concurrency, worker_id: index, dryrun: dryrun) + end end # Starts a Sidekiq process that processes _only_ the given queues. # # Returns the PID of the started process. - def self.start_sidekiq(queues, env, directory = Dir.pwd, max_concurrency = 50, dryrun: false) + def self.start_sidekiq(queues, env:, directory:, max_concurrency:, worker_id:, dryrun:) counts = count_by_queue(queues) cmd = %w[bundle exec sidekiq] @@ -90,7 +92,8 @@ def self.start_sidekiq(queues, env, directory = Dir.pwd, max_concurrency = 50, d end pid = Process.spawn( - { 'ENABLE_SIDEKIQ_CLUSTER' => '1' }, + { 'ENABLE_SIDEKIQ_CLUSTER' => '1', + 'SIDEKIQ_WORKER_ID' => worker_id.to_s }, *cmd, pgroup: true, err: $stderr, diff --git a/ee/lib/gitlab/sidekiq_cluster/cli.rb b/ee/lib/gitlab/sidekiq_cluster/cli.rb index d39bf0d6dcf3..a44b8dc801ba 100644 --- a/ee/lib/gitlab/sidekiq_cluster/cli.rb +++ b/ee/lib/gitlab/sidekiq_cluster/cli.rb @@ -49,7 +49,8 @@ def run(argv = ARGV) @logger.info("Starting cluster with #{queue_groups.length} processes") - @processes = SidekiqCluster.start(queue_groups, @environment, @rails_path, @max_concurrency, dryrun: @dryrun) + @processes = SidekiqCluster.start(queue_groups, env: @environment, directory: @rails_path, + max_concurrency: @max_concurrency, dryrun: @dryrun) return if @dryrun diff --git a/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb b/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb index d450b5872be8..899ad964498c 100644 --- a/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb +++ b/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb @@ -4,6 +4,9 @@ describe Gitlab::SidekiqCluster::CLI do let(:cli) { described_class.new('/dev/null') } + let(:default_options) do + { env: 'test', directory: Dir.pwd, max_concurrency: 50, dryrun: false } + end describe '#run' do context 'without any arguments' do @@ -21,7 +24,7 @@ it 'starts the Sidekiq workers' do expect(Gitlab::SidekiqCluster).to receive(:start) - .with([['foo']], 'test', Dir.pwd, 50, dryrun: false) + .with([['foo']], default_options) .and_return([]) cli.run(%w(foo)) @@ -31,7 +34,7 @@ it 'starts Sidekiq workers for all queues in all_queues.yml except the ones in argv' do expect(Gitlab::SidekiqConfig).to receive(:worker_queues).and_return(['baz']) expect(Gitlab::SidekiqCluster).to receive(:start) - .with([['baz']], 'test', Dir.pwd, 50, dryrun: false) + .with([['baz']], default_options) .and_return([]) cli.run(%w(foo -n)) @@ -42,7 +45,7 @@ it 'starts Sidekiq workers for specified queues with a max concurrency' do expect(Gitlab::SidekiqConfig).to receive(:worker_queues).and_return(%w(foo bar baz)) expect(Gitlab::SidekiqCluster).to receive(:start) - .with([%w(foo bar baz), %w(solo)], 'test', Dir.pwd, 2, dryrun: false) + .with([%w(foo bar baz), %w(solo)], default_options.merge(max_concurrency: 2)) .and_return([]) cli.run(%w(foo,bar,baz solo -m 2)) @@ -53,7 +56,7 @@ it 'starts Sidekiq workers for all queues in all_queues.yml with a namespace in argv' do expect(Gitlab::SidekiqConfig).to receive(:worker_queues).and_return(['cronjob:foo', 'cronjob:bar']) expect(Gitlab::SidekiqCluster).to receive(:start) - .with([['cronjob', 'cronjob:foo', 'cronjob:bar']], 'test', Dir.pwd, 50, dryrun: false) + .with([['cronjob', 'cronjob:foo', 'cronjob:bar']], default_options) .and_return([]) cli.run(%w(cronjob)) diff --git a/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb b/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb index 5f35c79fbc6e..8447e8ea80d5 100644 --- a/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb +++ b/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb @@ -58,36 +58,44 @@ end describe '.start' do - it 'starts Sidekiq with the given queues and environment' do - expect(described_class).to receive(:start_sidekiq) - .ordered.with(%w(foo), :production, 'foo/bar', 50, dryrun: false) + it 'starts Sidekiq with the given queues, environment and options' do + expected_options = { env: :production, directory: 'foo/bar', max_concurrency: 20, dryrun: true } - expect(described_class).to receive(:start_sidekiq) - .ordered.with(%w(bar baz), :production, 'foo/bar', 50, dryrun: false) + expect(described_class).to receive(:start_sidekiq).ordered.with(%w(foo), expected_options.merge(worker_id: 0)) + expect(described_class).to receive(:start_sidekiq).ordered.with(%w(bar baz), expected_options.merge(worker_id: 1)) - described_class.start([%w(foo), %w(bar baz)], :production, 'foo/bar', 50) + described_class.start([%w(foo), %w(bar baz)], env: :production, directory: 'foo/bar', max_concurrency: 20, dryrun: true) end - it 'starts Sidekiq with capped concurrency limits for each queue' do - expect(described_class).to receive(:start_sidekiq) - .ordered.with(%w(foo bar baz), :production, 'foo/bar', 2, dryrun: false) + it 'starts Sidekiq with the given queues and sensible default options' do + expected_options = { + env: :development, + directory: an_instance_of(String), + max_concurrency: 50, + worker_id: an_instance_of(Integer), + dryrun: false + } - expect(described_class).to receive(:start_sidekiq) - .ordered.with(%w(solo), :production, 'foo/bar', 2, dryrun: false) + expect(described_class).to receive(:start_sidekiq).ordered.with(%w(foo bar baz), expected_options) + expect(described_class).to receive(:start_sidekiq).ordered.with(%w(solo), expected_options) - described_class.start([%w(foo bar baz), %w(solo)], :production, 'foo/bar', 2) + described_class.start([%w(foo bar baz), %w(solo)]) end end describe '.start_sidekiq' do - let(:env) { { "ENABLE_SIDEKIQ_CLUSTER" => "1" } } + let(:first_worker_id) { 0 } + let(:options) do + { env: :production, directory: 'foo/bar', max_concurrency: 20, worker_id: first_worker_id, dryrun: false } + end + let(:env) { { "ENABLE_SIDEKIQ_CLUSTER" => "1", "SIDEKIQ_WORKER_ID" => first_worker_id.to_s } } let(:args) { ['bundle', 'exec', 'sidekiq', anything, '-eproduction', *([anything] * 5)] } it 'starts a Sidekiq process' do allow(Process).to receive(:spawn).and_return(1) expect(described_class).to receive(:wait_async).with(1) - expect(described_class.start_sidekiq(%w(foo), :production)).to eq(1) + expect(described_class.start_sidekiq(%w(foo), options)).to eq(1) end it 'handles duplicate queue names' do @@ -97,7 +105,7 @@ .and_return(1) expect(described_class).to receive(:wait_async).with(1) - expect(described_class.start_sidekiq(%w(foo foo bar baz), :production)).to eq(1) + expect(described_class.start_sidekiq(%w(foo foo bar baz), options)).to eq(1) end it 'runs the sidekiq process in a new process group' do @@ -107,7 +115,7 @@ .and_return(1) allow(described_class).to receive(:wait_async) - expect(described_class.start_sidekiq(%w(foo bar baz), :production)).to eq(1) + expect(described_class.start_sidekiq(%w(foo bar baz), options)).to eq(1) end end diff --git a/lib/gitlab/sidekiq_middleware/metrics.rb b/lib/gitlab/sidekiq_middleware/metrics.rb index 8af353d8674f..86762a2d02b3 100644 --- a/lib/gitlab/sidekiq_middleware/metrics.rb +++ b/lib/gitlab/sidekiq_middleware/metrics.rb @@ -9,6 +9,8 @@ class Metrics def initialize @metrics = init_metrics + + @metrics[:sidekiq_concurrency].set({}, Sidekiq.options[:concurrency].to_i) end def call(_worker, job, queue) @@ -45,7 +47,8 @@ def init_metrics sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), - sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :livesum) + sidekiq_running_jobs: ::Gitlab::Metrics.gauge(:sidekiq_running_jobs, 'Number of Sidekiq jobs running', {}, :all), + sidekiq_concurrency: ::Gitlab::Metrics.gauge(:sidekiq_concurrency, 'Maximum number of Sidekiq jobs', {}, :all) } end diff --git a/lib/prometheus/pid_provider.rb b/lib/prometheus/pid_provider.rb index e0f7e7e0a9e0..228639357ac0 100644 --- a/lib/prometheus/pid_provider.rb +++ b/lib/prometheus/pid_provider.rb @@ -6,7 +6,7 @@ module PidProvider def worker_id if Sidekiq.server? - 'sidekiq' + sidekiq_worker_id elsif defined?(Unicorn::Worker) unicorn_worker_id elsif defined?(::Puma) @@ -18,6 +18,14 @@ def worker_id private + def sidekiq_worker_id + if worker = ENV['SIDEKIQ_WORKER_ID'] + "sidekiq_#{worker}" + else + 'sidekiq' + end + end + def unicorn_worker_id if matches = process_name.match(/unicorn.*worker\[([0-9]+)\]/) "unicorn_#{matches[1]}" diff --git a/spec/lib/gitlab/sidekiq_middleware/metrics_spec.rb b/spec/lib/gitlab/sidekiq_middleware/metrics_spec.rb index 806112fcb166..ec4c8560f228 100644 --- a/spec/lib/gitlab/sidekiq_middleware/metrics_spec.rb +++ b/spec/lib/gitlab/sidekiq_middleware/metrics_spec.rb @@ -3,25 +3,37 @@ require 'spec_helper' describe Gitlab::SidekiqMiddleware::Metrics do - describe '#call' do - let(:middleware) { described_class.new } - let(:worker) { double(:worker) } - - let(:completion_seconds_metric) { double('completion seconds metric') } - let(:user_execution_seconds_metric) { double('user execution seconds metric') } - let(:failed_total_metric) { double('failed total metric') } - let(:retried_total_metric) { double('retried total metric') } - let(:running_jobs_metric) { double('running jobs metric') } + let(:middleware) { described_class.new } + + let(:concurrency_metric) { double('concurrency metric') } + let(:completion_seconds_metric) { double('completion seconds metric') } + let(:user_execution_seconds_metric) { double('user execution seconds metric') } + let(:failed_total_metric) { double('failed total metric') } + let(:retried_total_metric) { double('retried total metric') } + let(:running_jobs_metric) { double('running jobs metric') } + + before do + allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric) + allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) + allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) + allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) + allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :all).and_return(running_jobs_metric) + allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_concurrency, anything, {}, :all).and_return(concurrency_metric) + + allow(running_jobs_metric).to receive(:increment) + allow(concurrency_metric).to receive(:set) + end - before do - allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_completion_seconds, anything, anything, anything).and_return(completion_seconds_metric) - allow(Gitlab::Metrics).to receive(:histogram).with(:sidekiq_jobs_cpu_seconds, anything, anything, anything).and_return(user_execution_seconds_metric) - allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_failed_total, anything).and_return(failed_total_metric) - allow(Gitlab::Metrics).to receive(:counter).with(:sidekiq_jobs_retried_total, anything).and_return(retried_total_metric) - allow(Gitlab::Metrics).to receive(:gauge).with(:sidekiq_running_jobs, anything, {}, :livesum).and_return(running_jobs_metric) + describe '#initialize' do + it 'sets general metrics' do + expect(concurrency_metric).to receive(:set).with({}, Sidekiq.options[:concurrency].to_i) - allow(running_jobs_metric).to receive(:increment) + middleware end + end + + describe '#call' do + let(:worker) { double(:worker) } it 'yields block' do allow(completion_seconds_metric).to receive(:observe) @@ -30,7 +42,7 @@ expect { |b| middleware.call(worker, {}, :test, &b) }.to yield_control.once end - it 'sets metrics' do + it 'sets queue specific metrics' do labels = { queue: :test } allow(middleware).to receive(:get_thread_cputime).and_return(1, 3) diff --git a/spec/lib/prometheus/pid_provider_spec.rb b/spec/lib/prometheus/pid_provider_spec.rb index ba843b272548..6fdc11b14c40 100644 --- a/spec/lib/prometheus/pid_provider_spec.rb +++ b/spec/lib/prometheus/pid_provider_spec.rb @@ -18,7 +18,17 @@ expect(Sidekiq).to receive(:server?).and_return(true) end - it { is_expected.to eq 'sidekiq' } + context 'in a clustered setup' do + before do + stub_env('SIDEKIQ_WORKER_ID', '123') + end + + it { is_expected.to eq 'sidekiq_123' } + end + + context 'in a single process setup' do + it { is_expected.to eq 'sidekiq' } + end end context 'when running in Unicorn mode' do -- GitLab