From 5e2f02c38e964f9f31074611fdffae94d3f6b6d3 Mon Sep 17 00:00:00 2001 From: Roy Zwambag <rzwambag@gitlab.com> Date: Wed, 8 Dec 2021 12:16:21 +0000 Subject: [PATCH] Start a separate metrics server that can serve Sidekiq metrics --- bin/metrics-server | 3 +- lib/gitlab/process_management.rb | 6 + metrics_server/dependencies.rb | 1 + metrics_server/metrics_server.rb | 12 +- sidekiq_cluster/cli.rb | 57 ++++++ sidekiq_cluster/dependencies.rb | 6 - sidekiq_cluster/sidekiq_cluster.rb | 22 ++- .../metrics_server/metrics_server_spec.rb | 2 +- spec/commands/sidekiq_cluster/cli_spec.rb | 172 ++++++++++++++++++ spec/lib/gitlab/process_management_spec.rb | 28 ++- spec/metrics_server/metrics_server_spec.rb | 26 ++- spec/sidekiq_cluster/sidekiq_cluster_spec.rb | 2 + spec/tooling/quality/test_level_spec.rb | 4 +- tooling/quality/test_level.rb | 1 + 14 files changed, 312 insertions(+), 30 deletions(-) delete mode 100644 sidekiq_cluster/dependencies.rb diff --git a/bin/metrics-server b/bin/metrics-server index 48d3a6402c3ee..16c98f65f52ba 100755 --- a/bin/metrics-server +++ b/bin/metrics-server @@ -8,8 +8,9 @@ begin raise "Required: METRICS_SERVER_TARGET=[sidekiq]" unless target == 'sidekiq' metrics_dir = ENV["prometheus_multiproc_dir"] || File.absolute_path("tmp/prometheus_multiproc_dir/#{target}") + wipe_metrics_dir = Gitlab::Utils.to_boolean(ENV['WIPE_METRICS_DIR']) || false # Re-raise exceptions in threads on the main thread. Thread.abort_on_exception = true - MetricsServer.new(target, metrics_dir).start + MetricsServer.new(target, metrics_dir, wipe_metrics_dir).start end diff --git a/lib/gitlab/process_management.rb b/lib/gitlab/process_management.rb index 2c2aaadb29ac9..604b6129648e2 100644 --- a/lib/gitlab/process_management.rb +++ b/lib/gitlab/process_management.rb @@ -70,11 +70,17 @@ def self.pids_alive(pids) end def self.process_alive?(pid) + return false if pid.nil? + # Signal 0 tests whether the process exists and we have access to send signals # but is otherwise a noop (doesn't actually send a signal to the process) signal(pid, 0) end + def self.process_died?(pid) + !process_alive?(pid) + end + def self.write_pid(path) File.open(path, 'w') do |handle| handle.write(Process.pid.to_s) diff --git a/metrics_server/dependencies.rb b/metrics_server/dependencies.rb index ecfef502febb6..80e68b95ed1d0 100644 --- a/metrics_server/dependencies.rb +++ b/metrics_server/dependencies.rb @@ -13,6 +13,7 @@ require_relative 'settings_overrides' require_relative '../lib/gitlab/daemon' +require_relative '../lib/gitlab/utils' require_relative '../lib/gitlab/utils/strong_memoize' require_relative '../lib/prometheus/cleanup_multiproc_dir_service' require_relative '../lib/gitlab/metrics/prometheus' diff --git a/metrics_server/metrics_server.rb b/metrics_server/metrics_server.rb index 09171d8220b02..9dc3ba9153681 100644 --- a/metrics_server/metrics_server.rb +++ b/metrics_server/metrics_server.rb @@ -1,16 +1,17 @@ # frozen_string_literal: true -require_relative '../config/bundler_setup' +require_relative '../config/boot' require_relative 'dependencies' class MetricsServer # rubocop:disable Gitlab/NamespacedClass class << self - def spawn(target, gitlab_config: nil) + def spawn(target, gitlab_config: nil, wipe_metrics_dir: false) cmd = "#{Rails.root}/bin/metrics-server" env = { 'METRICS_SERVER_TARGET' => target, - 'GITLAB_CONFIG' => gitlab_config + 'GITLAB_CONFIG' => gitlab_config, + 'WIPE_METRICS_DIR' => wipe_metrics_dir.to_s } Process.spawn(env, cmd, err: $stderr, out: $stdout).tap do |pid| @@ -19,9 +20,10 @@ def spawn(target, gitlab_config: nil) end end - def initialize(target, metrics_dir) + def initialize(target, metrics_dir, wipe_metrics_dir) @target = target @metrics_dir = metrics_dir + @wipe_metrics_dir = wipe_metrics_dir end def start @@ -30,7 +32,7 @@ def start end FileUtils.mkdir_p(@metrics_dir, mode: 0700) - ::Prometheus::CleanupMultiprocDirService.new.execute + ::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir settings = Settings.monitoring.sidekiq_exporter exporter_class = "Gitlab::Metrics::Exporter::#{@target.camelize}Exporter".constantize diff --git a/sidekiq_cluster/cli.rb b/sidekiq_cluster/cli.rb index 2dbb1e9c7c792..274b7c03e14bb 100644 --- a/sidekiq_cluster/cli.rb +++ b/sidekiq_cluster/cli.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative '../config/bundler_setup' + require 'optparse' require 'logger' require 'time' @@ -12,6 +14,7 @@ require_relative '../lib/gitlab/sidekiq_config/worker_matcher' require_relative '../lib/gitlab/sidekiq_logging/json_formatter' require_relative '../lib/gitlab/process_management' +require_relative '../metrics_server/metrics_server' require_relative 'sidekiq_cluster' module Gitlab @@ -89,6 +92,8 @@ def run(argv = ARGV) @logger.info("Starting cluster with #{queue_groups.length} processes") end + start_metrics_server(wipe_metrics_dir: true) + @processes = SidekiqCluster.start( queue_groups, env: @environment, @@ -154,17 +159,69 @@ def start_loop while @alive sleep(@interval) + if metrics_server_enabled? && ProcessManagement.process_died?(@metrics_server_pid) + @logger.warn('Metrics server went away') + start_metrics_server(wipe_metrics_dir: false) + end + unless ProcessManagement.all_alive?(@processes) # If a child process died we'll just terminate the whole cluster. It's up to # runit and such to then restart the cluster. @logger.info('A worker terminated, shutting down the cluster') + stop_metrics_server ProcessManagement.signal_processes(@processes, :TERM) break end end end + def start_metrics_server(wipe_metrics_dir: false) + return unless metrics_server_enabled? + + @logger.info("Starting metrics server on port #{sidekiq_exporter_port}") + @metrics_server_pid = MetricsServer.spawn('sidekiq', wipe_metrics_dir: wipe_metrics_dir) + end + + def sidekiq_exporter_enabled? + ::Settings.monitoring.sidekiq_exporter.enabled + rescue Settingslogic::MissingSetting + nil + end + + def exporter_has_a_unique_port? + # In https://gitlab.com/gitlab-org/gitlab/-/issues/345802 we added settings for sidekiq_health_checks. + # These settings default to the same values as sidekiq_exporter for backwards compatibility. + # If a different port for sidekiq_health_checks has been set up, we know that the + # user wants to serve health checks and metrics from different servers. + return false if sidekiq_health_check_port.nil? || sidekiq_exporter_port.nil? + + sidekiq_exporter_port != sidekiq_health_check_port + end + + def sidekiq_exporter_port + ::Settings.monitoring.sidekiq_exporter.port + rescue Settingslogic::MissingSetting + nil + end + + def sidekiq_health_check_port + ::Settings.monitoring.sidekiq_health_checks.port + rescue Settingslogic::MissingSetting + nil + end + + def metrics_server_enabled? + !@dryrun && sidekiq_exporter_enabled? && exporter_has_a_unique_port? + end + + def stop_metrics_server + return unless @metrics_server_pid + + @logger.info("Stopping metrics server (PID #{@metrics_server_pid})") + ProcessManagement.signal(@metrics_server_pid, :TERM) + end + def option_parser OptionParser.new do |opt| opt.banner = "#{File.basename(__FILE__)} [QUEUE,QUEUE] [QUEUE] ... [OPTIONS]" diff --git a/sidekiq_cluster/dependencies.rb b/sidekiq_cluster/dependencies.rb deleted file mode 100644 index 91e91475f1592..0000000000000 --- a/sidekiq_cluster/dependencies.rb +++ /dev/null @@ -1,6 +0,0 @@ -# rubocop:disable Naming/FileName -# frozen_string_literal: true - -require 'shellwords' - -# rubocop:enable Naming/FileName diff --git a/sidekiq_cluster/sidekiq_cluster.rb b/sidekiq_cluster/sidekiq_cluster.rb index c3aa9e05a09b4..c5139ab887467 100644 --- a/sidekiq_cluster/sidekiq_cluster.rb +++ b/sidekiq_cluster/sidekiq_cluster.rb @@ -1,6 +1,5 @@ # frozen_string_literal: true -require_relative 'dependencies' require_relative '../lib/gitlab/process_management' module Gitlab @@ -67,14 +66,19 @@ def self.start_sidekiq(queues, env:, directory:, max_concurrency:, min_concurren return end - pid = Process.spawn( - { 'ENABLE_SIDEKIQ_CLUSTER' => '1', - 'SIDEKIQ_WORKER_ID' => worker_id.to_s }, - *cmd, - pgroup: true, - err: $stderr, - out: $stdout - ) + # We need to remove Bundler specific env vars, since otherwise the + # child process will think we are passing an alternative Gemfile + # and will clear and reset LOAD_PATH. + pid = Bundler.with_original_env do + Process.spawn( + { 'ENABLE_SIDEKIQ_CLUSTER' => '1', + 'SIDEKIQ_WORKER_ID' => worker_id.to_s }, + *cmd, + pgroup: true, + err: $stderr, + out: $stdout + ) + end ProcessManagement.wait_async(pid) diff --git a/spec/commands/metrics_server/metrics_server_spec.rb b/spec/commands/metrics_server/metrics_server_spec.rb index 4eff9136f2b6b..7a03ff3fe7565 100644 --- a/spec/commands/metrics_server/metrics_server_spec.rb +++ b/spec/commands/metrics_server/metrics_server_spec.rb @@ -29,7 +29,7 @@ config_file.write(YAML.dump(config)) config_file.close - @pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path) + @pid = MetricsServer.spawn('sidekiq', gitlab_config: config_file.path, wipe_metrics_dir: true) end after do diff --git a/spec/commands/sidekiq_cluster/cli_spec.rb b/spec/commands/sidekiq_cluster/cli_spec.rb index f5642c00ca412..c057660808875 100644 --- a/spec/commands/sidekiq_cluster/cli_spec.rb +++ b/spec/commands/sidekiq_cluster/cli_spec.rb @@ -12,8 +12,23 @@ { env: 'test', directory: Dir.pwd, max_concurrency: 50, min_concurrency: 0, dryrun: false, timeout: timeout } end + let(:sidekiq_exporter_enabled) { false } + let(:sidekiq_exporter_port) { '3807' } + let(:sidekiq_health_checks_port) { '3807' } + before do stub_env('RAILS_ENV', 'test') + stub_config( + monitoring: { + sidekiq_exporter: { + enabled: sidekiq_exporter_enabled, + port: sidekiq_exporter_port + }, + sidekiq_health_checks: { + port: sidekiq_health_checks_port + } + } + ) end describe '#run' do @@ -241,6 +256,163 @@ end end end + + context 'metrics server' do + context 'starting the server' do + context 'without --dryrun' do + context 'when there are no sidekiq_health_checks settings set' do + before do + stub_config( + monitoring: { + sidekiq_exporter: { + enabled: true, + port: sidekiq_exporter_port + } + } + ) + + allow(Gitlab::SidekiqCluster).to receive(:start) + allow(cli).to receive(:write_pid) + allow(cli).to receive(:trap_signals) + allow(cli).to receive(:start_loop) + end + + it 'does not start a sidekiq metrics server' do + expect(MetricsServer).not_to receive(:spawn) + + cli.run(%w(foo)) + end + + it 'rescues Settingslogic::MissingSetting' do + expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting) + end + end + + context 'when the sidekiq_exporter.port setting is not set' do + before do + stub_config( + monitoring: { + sidekiq_exporter: { + enabled: true + }, + sidekiq_health_checks: { + port: sidekiq_health_checks_port + } + } + ) + + allow(Gitlab::SidekiqCluster).to receive(:start) + allow(cli).to receive(:write_pid) + allow(cli).to receive(:trap_signals) + allow(cli).to receive(:start_loop) + end + + it 'does not start a sidekiq metrics server' do + expect(MetricsServer).not_to receive(:spawn) + + cli.run(%w(foo)) + end + + it 'rescues Settingslogic::MissingSetting' do + expect { cli.run(%w(foo)) }.not_to raise_error(Settingslogic::MissingSetting) + end + end + + context 'when sidekiq_exporter.enabled setting is not set' do + before do + stub_config( + monitoring: { + sidekiq_exporter: {}, + sidekiq_health_checks: { + port: sidekiq_health_checks_port + } + } + ) + + allow(Gitlab::SidekiqCluster).to receive(:start) + allow(cli).to receive(:write_pid) + allow(cli).to receive(:trap_signals) + allow(cli).to receive(:start_loop) + end + + it 'does not start a sidekiq metrics server' do + expect(MetricsServer).not_to receive(:spawn) + + cli.run(%w(foo)) + end + end + + using RSpec::Parameterized::TableSyntax + + where(:sidekiq_exporter_enabled, :sidekiq_exporter_port, :sidekiq_health_checks_port, :start_metrics_server) do + true | '3807' | '3907' | true + true | '3807' | '3807' | false + false | '3807' | '3907' | false + false | '3807' | '3907' | false + end + + with_them do + before do + allow(Gitlab::SidekiqCluster).to receive(:start) + allow(cli).to receive(:write_pid) + allow(cli).to receive(:trap_signals) + allow(cli).to receive(:start_loop) + end + + specify do + if start_metrics_server + expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: true) + else + expect(MetricsServer).not_to receive(:spawn) + end + + cli.run(%w(foo)) + end + end + end + + context 'with --dryrun set' do + let(:sidekiq_exporter_enabled) { true } + + it 'does not start the server' do + expect(MetricsServer).not_to receive(:spawn) + + cli.run(%w(foo --dryrun)) + end + end + end + + context 'supervising the server' do + let(:sidekiq_exporter_enabled) { true } + let(:sidekiq_health_checks_port) { '3907' } + + before do + allow(cli).to receive(:sleep).with(a_kind_of(Numeric)) + allow(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false).and_return(99) + cli.start_metrics_server + end + + it 'stops the metrics server when one of the processes has been terminated' do + allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(false) + allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false) + allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(an_instance_of(Array), :TERM) + + expect(Process).to receive(:kill).with(:TERM, 99) + + cli.start_loop + end + + it 'starts the metrics server when it is down' do + allow(Gitlab::ProcessManagement).to receive(:process_died?).and_return(true) + allow(Gitlab::ProcessManagement).to receive(:all_alive?).with(an_instance_of(Array)).and_return(false) + allow(cli).to receive(:stop_metrics_server) + + expect(MetricsServer).to receive(:spawn).with('sidekiq', wipe_metrics_dir: false) + + cli.start_loop + end + end + end end describe '#write_pid' do diff --git a/spec/lib/gitlab/process_management_spec.rb b/spec/lib/gitlab/process_management_spec.rb index 71c4acdecb906..4e367e258f2e8 100644 --- a/spec/lib/gitlab/process_management_spec.rb +++ b/spec/lib/gitlab/process_management_spec.rb @@ -78,7 +78,7 @@ end describe '.process_alive?' do - it 'returns true if the proces is alive' do + it 'returns true if the process is alive' do process = Process.pid expect(described_class.process_alive?(process)).to eq(true) @@ -89,6 +89,32 @@ expect(described_class.process_alive?(process)).to eq(false) end + + it 'returns false when no pid is given' do + process = nil + + expect(described_class.process_alive?(process)).to eq(false) + end + end + + describe '.process_died?' do + it 'returns false if the process is alive' do + process = Process.pid + + expect(described_class.process_died?(process)).to eq(false) + end + + it 'returns true when a thread was not alive' do + process = -2 + + expect(described_class.process_died?(process)).to eq(true) + end + + it 'returns true when no pid is given' do + process = nil + + expect(described_class.process_died?(process)).to eq(true) + end end describe '.pids_alive' do diff --git a/spec/metrics_server/metrics_server_spec.rb b/spec/metrics_server/metrics_server_spec.rb index 58556d2cf5aef..fcd5b911d2a7d 100644 --- a/spec/metrics_server/metrics_server_spec.rb +++ b/spec/metrics_server/metrics_server_spec.rb @@ -12,7 +12,8 @@ let(:env) do { 'METRICS_SERVER_TARGET' => 'sidekiq', - 'GITLAB_CONFIG' => nil + 'GITLAB_CONFIG' => nil, + 'WIPE_METRICS_DIR' => 'false' } end @@ -32,7 +33,7 @@ let(:metrics_dir) { Dir.mktmpdir } let(:settings_double) { double(:settings, sidekiq_exporter: {}) } - subject(:metrics_server) { described_class.new('fake', metrics_dir)} + subject(:metrics_server) { described_class.new('fake', metrics_dir, true)} before do stub_env('prometheus_multiproc_dir', metrics_dir) @@ -42,6 +43,7 @@ end after do + ::Prometheus::CleanupMultiprocDirService.new.execute Dir.rmdir(metrics_dir) end @@ -59,10 +61,24 @@ metrics_server.start end - it 'removes any old metrics files' do - FileUtils.touch("#{metrics_dir}/remove_this.db") + context 'when wipe_metrics_dir is true' do + subject(:metrics_server) { described_class.new('fake', metrics_dir, true)} - expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true) + it 'removes any old metrics files' do + FileUtils.touch("#{metrics_dir}/remove_this.db") + + expect { metrics_server.start }.to change { Dir.empty?(metrics_dir) }.from(false).to(true) + end + end + + context 'when wipe_metrics_dir is false' do + subject(:metrics_server) { described_class.new('fake', metrics_dir, false)} + + it 'does not remove any old metrics files' do + FileUtils.touch("#{metrics_dir}/remove_this.db") + + expect { metrics_server.start }.not_to change { Dir.empty?(metrics_dir) }.from(false) + end end it 'starts a metrics server' do diff --git a/spec/sidekiq_cluster/sidekiq_cluster_spec.rb b/spec/sidekiq_cluster/sidekiq_cluster_spec.rb index 86b3fc6a48c1e..c0a919a4aec8d 100644 --- a/spec/sidekiq_cluster/sidekiq_cluster_spec.rb +++ b/spec/sidekiq_cluster/sidekiq_cluster_spec.rb @@ -13,6 +13,8 @@ out: $stdout } + expect(Bundler).to receive(:with_original_env).and_call_original.twice + expect(Process).to receive(:spawn).ordered.with({ "ENABLE_SIDEKIQ_CLUSTER" => "1", "SIDEKIQ_WORKER_ID" => "0" diff --git a/spec/tooling/quality/test_level_spec.rb b/spec/tooling/quality/test_level_spec.rb index 3348e49573267..8a944a473d735 100644 --- a/spec/tooling/quality/test_level_spec.rb +++ b/spec/tooling/quality/test_level_spec.rb @@ -28,7 +28,7 @@ context 'when level is unit' do it 'returns a pattern' do expect(subject.pattern(:unit)) - .to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb") + .to eq("spec/{bin,channels,config,db,dependencies,elastic,elastic_integration,experiments,factories,finders,frontend,graphql,haml_lint,helpers,initializers,javascripts,lib,metrics_server,models,policies,presenters,rack_servers,replicators,routing,rubocop,scripts,serializers,services,sidekiq,sidekiq_cluster,spam,support_specs,tasks,uploaders,validators,views,workers,tooling}{,/**/}*_spec.rb") end end @@ -110,7 +110,7 @@ context 'when level is unit' do it 'returns a regexp' do expect(subject.regexp(:unit)) - .to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)}) + .to eq(%r{spec/(bin|channels|config|db|dependencies|elastic|elastic_integration|experiments|factories|finders|frontend|graphql|haml_lint|helpers|initializers|javascripts|lib|metrics_server|models|policies|presenters|rack_servers|replicators|routing|rubocop|scripts|serializers|services|sidekiq|sidekiq_cluster|spam|support_specs|tasks|uploaders|validators|views|workers|tooling)}) end end diff --git a/tooling/quality/test_level.rb b/tooling/quality/test_level.rb index 3cc1d87eb36d8..50cbc69beb214 100644 --- a/tooling/quality/test_level.rb +++ b/tooling/quality/test_level.rb @@ -45,6 +45,7 @@ class TestLevel serializers services sidekiq + sidekiq_cluster spam support_specs tasks -- GitLab