diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb
index a65b1041d8379b9484bf3859c958605ff46cd946..81e55a139c2926acf87edd90fe007f172d6b82f6 100644
--- a/config/initializers/7_prometheus_metrics.rb
+++ b/config/initializers/7_prometheus_metrics.rb
@@ -1,5 +1,7 @@
 # frozen_string_literal: true
 
+PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRICS_SERVER'])
+
 # Keep separate directories for separate processes
 def prometheus_default_multiproc_dir
   return unless Rails.env.development? || Rails.env.test?
@@ -72,8 +74,12 @@ def sidekiq_metrics_server_process?
     if Gitlab::Runtime.puma?
       Gitlab::Metrics::Samplers::PumaSampler.instance.start
 
-      # Starts a metrics server to export metrics from the Puma primary.
-      Gitlab::Metrics::Exporter::WebExporter.instance.start
+      if Settings.monitoring.web_exporter.enabled && PUMA_EXTERNAL_METRICS_SERVER
+        require_relative '../../metrics_server/metrics_server'
+        MetricsServer.start_for_puma
+      else
+        Gitlab::Metrics::Exporter::WebExporter.instance.start
+      end
     end
 
     Gitlab::Ci::Parsers.instrument!
@@ -90,11 +96,13 @@ def sidekiq_metrics_server_process?
     Gitlab::Metrics::Samplers::ThreadsSampler.initialize_instance(logger: logger).start
 
     if Gitlab::Runtime.puma?
-      # Since we are running a metrics server on the Puma primary, we would inherit
-      # this thread after forking into workers, so we need to explicitly stop it here.
-      # NOTE: This will not be necessary anymore after moving to an external server
-      # process via https://gitlab.com/gitlab-org/gitlab/-/issues/350548
-      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      # Since we are observing a metrics server from the Puma primary, we would inherit
+      # this supervision thread after forking into workers, so we need to explicitly stop it here.
+      if PUMA_EXTERNAL_METRICS_SERVER
+        ::MetricsServer::PumaProcessSupervisor.instance.stop
+      else
+        Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      end
 
       Gitlab::Metrics::Samplers::ActionCableSampler.instance(logger: logger).start
     end
@@ -112,16 +120,24 @@ def sidekiq_metrics_server_process?
   if Gitlab::Runtime.puma?
     Gitlab::Cluster::LifecycleEvents.on_before_graceful_shutdown do
       # We need to ensure that before we re-exec or shutdown server
-      # we do stop the exporter
-      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      # we also stop the metrics server
+      if PUMA_EXTERNAL_METRICS_SERVER
+        ::MetricsServer::PumaProcessSupervisor.instance.shutdown
+      else
+        Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      end
     end
 
     Gitlab::Cluster::LifecycleEvents.on_before_master_restart do
       # We need to ensure that before we re-exec server
-      # we do stop the exporter
+      # we also stop the metrics server
       #
       # We do it again, for being extra safe,
       # but it should not be needed
-      Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      if PUMA_EXTERNAL_METRICS_SERVER
+        ::MetricsServer::PumaProcessSupervisor.instance.shutdown
+      else
+        Gitlab::Metrics::Exporter::WebExporter.instance.stop
+      end
     end
   end
diff --git a/lib/gitlab/process_supervisor.rb b/lib/gitlab/process_supervisor.rb
index f0d2bbc33bdc71dacae89254d1e2bbbaea019d8f..18fd24aa582702817bff7714db48f397156a4954 100644
--- a/lib/gitlab/process_supervisor.rb
+++ b/lib/gitlab/process_supervisor.rb
@@ -9,7 +9,7 @@ module Gitlab
   # The supervisor will also trap termination signals if provided and
   # propagate those to the supervised processes. Any supervised processes
   # that do not terminate within a specified grace period will be killed.
-  class ProcessSupervisor
+  class ProcessSupervisor < Gitlab::Daemon
     DEFAULT_HEALTH_CHECK_INTERVAL_SECONDS = 5
     DEFAULT_TERMINATE_INTERVAL_SECONDS = 1
     DEFAULT_TERMINATE_TIMEOUT_SECONDS = 10
@@ -21,13 +21,18 @@ def initialize(
       check_terminate_interval_seconds: DEFAULT_TERMINATE_INTERVAL_SECONDS,
       terminate_timeout_seconds: DEFAULT_TERMINATE_TIMEOUT_SECONDS,
       term_signals: %i(INT TERM),
-      forwarded_signals: [])
+      forwarded_signals: [],
+      **options)
+      super(**options)
 
       @term_signals = term_signals
       @forwarded_signals = forwarded_signals
       @health_check_interval_seconds = health_check_interval_seconds
       @check_terminate_interval_seconds = check_terminate_interval_seconds
       @terminate_timeout_seconds = terminate_timeout_seconds
+
+      @pids = []
+      @alive = false
     end
 
     # Starts a supervision loop for the given process ID(s).
@@ -39,32 +44,66 @@ def initialize(
     # start observing those processes instead. Otherwise it will shut down.
     def supervise(pid_or_pids, &on_process_death)
       @pids = Array(pid_or_pids)
+      @on_process_death = on_process_death
 
       trap_signals!
 
+      start
+    end
+
+    # Shuts down the supervisor and all supervised processes with the given signal.
+    def shutdown(signal = :TERM)
+      return unless @alive
+
+      stop_processes(signal)
+      stop
+    end
+
+    def supervised_pids
+      @pids
+    end
+
+    private
+
+    def start_working
       @alive = true
+    end
+
+    def stop_working
+      @alive = false
+    end
+
+    def run_thread
       while @alive
         sleep(@health_check_interval_seconds)
-        check_process_health(&on_process_death)
+        check_process_health
       end
     end
 
-    private
-
-    def check_process_health(&on_process_death)
+    def check_process_health
       unless all_alive?
-        dead_pids = @pids - live_pids
-        @pids = Array(yield(dead_pids))
+        existing_pids = live_pids # Capture this value for the duration of the block.
+        dead_pids = @pids - existing_pids
+        new_pids = Array(@on_process_death.call(dead_pids))
+        @pids = existing_pids + new_pids
         @alive = @pids.any?
       end
     end
 
+    def stop_processes(signal)
+      # Set this prior to shutting down so that shutdown hooks which read `alive`
+      # know the supervisor is about to shut down.
+      @alive = false
+
+      # Shut down supervised processes.
+      signal_all(signal)
+      wait_for_termination
+    end
+
     def trap_signals!
       ProcessManagement.trap_signals(@term_signals) do |signal|
-        @alive = false
-        signal_all(signal)
-        wait_for_termination
+        stop_processes(signal)
       end
 
       ProcessManagement.trap_signals(@forwarded_signals) do |signal|
diff --git a/metrics_server/dependencies.rb b/metrics_server/dependencies.rb
index 02cec1173e0ca1a2660183b3df55b7d0e85e11f8..bfa6aae8ef82b7c0f3f7d29f55a39dcd0689ec84 100644
--- a/metrics_server/dependencies.rb
+++ b/metrics_server/dependencies.rb
@@ -31,5 +31,6 @@
 require_relative '../lib/gitlab/health_checks/probes/collection'
 require_relative '../lib/gitlab/health_checks/probes/status'
 require_relative '../lib/gitlab/process_management'
+require_relative '../lib/gitlab/process_supervisor'
 
 # rubocop:enable Naming/FileName
diff --git a/metrics_server/metrics_server.rb b/metrics_server/metrics_server.rb
index 70769459019c70f38dc1ea7b13d3dc7841d0d10c..5411aaa0b5fcf101ca27d322e58c151a6bd957a8 100644
--- a/metrics_server/metrics_server.rb
+++ b/metrics_server/metrics_server.rb
@@ -5,7 +5,28 @@
 require_relative 'dependencies'
 
 class MetricsServer # rubocop:disable Gitlab/NamespacedClass
+  # The singleton instance used to supervise the Puma metrics server.
+  PumaProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
+
   class << self
+    def start_for_puma
+      metrics_dir = ::Prometheus::Client.configuration.multiprocess_files_dir
+
+      start_server = proc do
+        MetricsServer.spawn('puma', metrics_dir: metrics_dir).tap do |pid|
+          Gitlab::AppLogger.info("Starting Puma metrics server with pid #{pid}")
+        end
+      end
+
+      supervisor = PumaProcessSupervisor.instance
+      supervisor.supervise(start_server.call) do
+        next unless supervisor.alive
+
+        Gitlab::AppLogger.info('Puma metrics server terminated, restarting...')
+        start_server.call
+      end
+    end
+
     def spawn(target, metrics_dir:, gitlab_config: nil, wipe_metrics_dir: false)
       ensure_valid_target!(target)
diff --git a/sidekiq_cluster/cli.rb b/sidekiq_cluster/cli.rb
index f366cb26b8ef8cd13fc37a80ca44c578354bb771..737d31bbc88ca4cd5e4bf88b0077794a810cc48d 100644
--- a/sidekiq_cluster/cli.rb
+++ b/sidekiq_cluster/cli.rb
@@ -10,6 +10,7 @@
 # we may run into "already initialized" warnings, hence the check.
 require_relative '../lib/gitlab' unless Object.const_defined?('Gitlab')
 require_relative '../lib/gitlab/utils'
+require_relative '../lib/gitlab/daemon'
 require_relative '../lib/gitlab/sidekiq_config/cli_methods'
 require_relative '../lib/gitlab/sidekiq_config/worker_matcher'
 require_relative '../lib/gitlab/sidekiq_logging/json_formatter'
@@ -121,11 +122,12 @@ def start_and_supervise_workers(queue_groups)
 
         ProcessManagement.write_pid(@pid) if @pid
 
-        supervisor = Gitlab::ProcessSupervisor.new(
+        supervisor = SidekiqProcessSupervisor.instance(
           health_check_interval_seconds: @interval,
           terminate_timeout_seconds: @soft_timeout_seconds + TIMEOUT_GRACE_PERIOD_SECONDS,
           term_signals: TERMINATE_SIGNALS,
-          forwarded_signals: FORWARD_SIGNALS
+          forwarded_signals: FORWARD_SIGNALS,
+          synchronous: true
         )
 
         metrics_server_pid = start_metrics_server
@@ -136,7 +138,7 @@ def start_and_supervise_workers(queue_groups)
           # If we're not in the process of shutting down the cluster,
           # and the metrics server died, restart it.
           if supervisor.alive && dead_pids.include?(metrics_server_pid)
-            @logger.info('Metrics server terminated, restarting...')
+            @logger.info('Sidekiq metrics server terminated, restarting...')
             metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
             all_pids = worker_pids + Array(metrics_server_pid)
           else
diff --git a/sidekiq_cluster/sidekiq_cluster.rb b/sidekiq_cluster/sidekiq_cluster.rb
index 3ba3211b0e42c6fe618e33b5c8863fe2669f2bcd..c68cbe7c16304ffff1ed91c695fe44a49bad4a24 100644
--- a/sidekiq_cluster/sidekiq_cluster.rb
+++ b/sidekiq_cluster/sidekiq_cluster.rb
@@ -16,6 +16,9 @@ module SidekiqCluster
     # before we kill the process.
     TIMEOUT_GRACE_PERIOD_SECONDS = 5
 
+    # The singleton instance used to supervise cluster processes.
+    SidekiqProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
+
     # Starts Sidekiq workers for the pairs of processes.
     #
     # Example:
diff --git a/spec/commands/sidekiq_cluster/cli_spec.rb b/spec/commands/sidekiq_cluster/cli_spec.rb
index 6baaa98eff9d419cebc36e3568bef18e2e3b3d91..2cb3f67b03d8bb1fda57e7bdc25bb8ea2332a426 100644
--- a/spec/commands/sidekiq_cluster/cli_spec.rb
+++ b/spec/commands/sidekiq_cluster/cli_spec.rb
@@ -40,6 +40,8 @@
     }
   end
 
+  let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
+
   before do
     stub_env('RAILS_ENV', 'test')
 
@@ -47,8 +49,11 @@
     config_file.close
 
     allow(::Settings).to receive(:source).and_return(config_file.path)
-    ::Settings.reload!
+
+    allow(Gitlab::ProcessManagement).to receive(:write_pid)
+    allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
+    allow(supervisor).to receive(:supervise)
   end
 
   after do
@@ -63,11 +68,6 @@
   end
 
   context 'with arguments' do
-    before do
-      allow(Gitlab::ProcessManagement).to receive(:write_pid)
-      allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
-    end
-
     it 'starts the Sidekiq workers' do
       expect(Gitlab::SidekiqCluster).to receive(:start)
         .with([['foo']], default_options)
@@ -298,8 +298,6 @@
     context 'without --dryrun' do
       before do
         allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
-        allow(Gitlab::ProcessManagement).to receive(:write_pid)
-        allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
       end
 
       context 'when there are no sidekiq_health_checks settings set' do
@@ -429,14 +427,11 @@
       before do
         allow(Gitlab::SidekiqCluster).to receive(:start).and_return(sidekiq_worker_pids)
-        allow(Gitlab::ProcessManagement).to receive(:write_pid)
       end
 
       it 'stops the entire process cluster if one of the workers has been terminated' do
-        allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
-          allow(it).to receive(:supervise).and_yield([2])
-        end
-
+        expect(supervisor).to receive(:alive).and_return(true)
+        expect(supervisor).to receive(:supervise).and_yield([2])
         expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
         expect(Gitlab::ProcessManagement).to receive(:signal_processes).with([42, 99], :TERM)
 
@@ -445,11 +440,8 @@
       context 'when the supervisor is alive' do
         it 'restarts the metrics server when it is down' do
-          allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
-            allow(it).to receive(:alive).and_return(true)
-            allow(it).to receive(:supervise).and_yield([metrics_server_pid])
-          end
-
+          expect(supervisor).to receive(:alive).and_return(true)
+          expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
           expect(MetricsServer).to receive(:fork).twice.and_return(metrics_server_pid)
 
           cli.run(%w(foo))
@@ -458,11 +450,8 @@
       context 'when the supervisor is shutting down' do
         it 'does not restart the metrics server' do
-          allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
-            allow(it).to receive(:alive).and_return(false)
-            allow(it).to receive(:supervise).and_yield([metrics_server_pid])
-          end
-
+          expect(supervisor).to receive(:alive).and_return(false)
+          expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
           expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
 
           cli.run(%w(foo))
diff --git a/spec/lib/gitlab/process_supervisor_spec.rb b/spec/lib/gitlab/process_supervisor_spec.rb
index d264c77d5fb30406ccf7269f0e504074bfb75af4..60b127daddacd5f7a8c18d2ad87c71da1c2f41ec 100644
--- a/spec/lib/gitlab/process_supervisor_spec.rb
+++ b/spec/lib/gitlab/process_supervisor_spec.rb
@@ -6,7 +6,9 @@
   let(:health_check_interval_seconds) { 0.1 }
   let(:check_terminate_interval_seconds) { 1 }
   let(:forwarded_signals) { [] }
-  let(:process_id) do
+  let(:process_ids) { [spawn_process, spawn_process] }
+
+  def spawn_process
     Process.spawn('while true; do sleep 1; done').tap do |pid|
       Process.detach(pid)
     end
   end
@@ -22,54 +24,52 @@
   end
 
   after do
-    if Gitlab::ProcessManagement.process_alive?(process_id)
-      Process.kill('KILL', process_id)
+    process_ids.each do |pid|
+      Process.kill('KILL', pid)
+    rescue Errno::ESRCH
+      # Ignore if a process wasn't actually alive.
     end
   end
 
   describe '#supervise' do
-    context 'while supervised process is alive' do
+    context 'while supervised processes are alive' do
       it 'does not invoke callback' do
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
         pids_killed = []
 
-        thread = Thread.new do
-          supervisor.supervise(process_id) do |dead_pids|
-            pids_killed = dead_pids
-            []
-          end
+        supervisor.supervise(process_ids) do |dead_pids|
+          pids_killed = dead_pids
+          []
         end
 
         # Wait several times the poll frequency of the supervisor.
         sleep health_check_interval_seconds * 10
-        thread.terminate
 
         expect(pids_killed).to be_empty
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
       end
     end
 
-    context 'when supervised process dies' do
-      it 'triggers callback with the dead PIDs' do
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
+    context 'when a supervised process dies' do
+      it 'triggers callback with the dead PIDs and adds new PIDs to supervised PIDs' do
+        expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
        pids_killed = []
 
-        thread = Thread.new do
-          supervisor.supervise(process_id) do |dead_pids|
-            pids_killed = dead_pids
-            []
-          end
+        supervisor.supervise(process_ids) do |dead_pids|
+          pids_killed = dead_pids
+          [42] # Fake starting a new process in place of the terminated one.
         end
 
         # Terminate the supervised process.
-        Process.kill('TERM', process_id)
+        Process.kill('TERM', process_ids.first)
 
         await_condition(sleep_sec: health_check_interval_seconds) do
-          pids_killed == [process_id]
+          pids_killed == [process_ids.first]
         end
-        thread.terminate
 
-        expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(false)
+        expect(Gitlab::ProcessManagement.process_alive?(process_ids.first)).to be(false)
+        expect(Gitlab::ProcessManagement.process_alive?(process_ids.last)).to be(true)
+        expect(supervisor.supervised_pids).to match_array([process_ids.last, 42])
       end
     end
 
@@ -78,7 +78,7 @@
        allow(supervisor).to receive(:sleep)
        allow(Gitlab::ProcessManagement).to receive(:trap_signals)
        allow(Gitlab::ProcessManagement).to receive(:all_alive?).and_return(false)
-        allow(Gitlab::ProcessManagement).to receive(:signal_processes).with([process_id], anything)
+        allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(process_ids, anything)
       end
 
       context 'termination signals' do
@@ -87,21 +87,21 @@
            allow(Gitlab::ProcessManagement).to receive(:any_alive?).and_return(false)
 
            expect(Gitlab::ProcessManagement).to receive(:trap_signals).ordered.with(%i(INT TERM)).and_yield(:TERM)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
            expect(supervisor).not_to receive(:sleep).with(check_terminate_interval_seconds)
 
-            supervisor.supervise(process_id) { [] }
+            supervisor.supervise(process_ids) { [] }
          end
        end
 
        context 'when TERM does not result in timely shutdown of processes' do
          it 'issues a KILL signal after the grace period expires' do
            expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(INT TERM)).and_yield(:TERM)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
            expect(supervisor).to receive(:sleep).ordered.with(check_terminate_interval_seconds).at_least(:once)
-            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], '-KILL')
+            expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
 
-            supervisor.supervise(process_id) { [] }
+            supervisor.supervise(process_ids) { [] }
          end
        end
      end
@@ -111,10 +111,53 @@
        it 'forwards given signals to the observed processes' do
          expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(USR1)).and_yield(:USR1)
-          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :USR1)
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :USR1)
+
+          supervisor.supervise(process_ids) { [] }
+        end
+      end
+    end
+  end
+
+  describe '#shutdown' do
+    context 'when supervisor is supervising processes' do
+      before do
+        supervisor.supervise(process_ids)
+      end
+
+      context 'when supervisor is alive' do
+        it 'signals TERM then KILL to all supervised processes' do
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
+          expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
 
-          supervisor.supervise(process_id) { [] }
+          supervisor.shutdown
         end
+
+        it 'stops the supervisor' do
+          expect { supervisor.shutdown }.to change { supervisor.alive }.from(true).to(false)
+        end
+      end
+
+      context 'when supervisor has already shut down' do
+        before do
+          supervisor.shutdown
+        end
+
+        it 'does nothing' do
+          expect(supervisor.alive).to be(false)
+          expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
+
+          supervisor.shutdown
+        end
+      end
+    end
+
+    context 'when supervisor never started' do
+      it 'does nothing' do
+        expect(supervisor.alive).to be(false)
+        expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
+
+        supervisor.shutdown
       end
     end
   end
diff --git a/spec/metrics_server/metrics_server_spec.rb b/spec/metrics_server/metrics_server_spec.rb
index 860a3299d853c9727c737e2331aab1e676ee1537..591840dcba21e63101422450d664d1298eb456b1 100644
--- a/spec/metrics_server/metrics_server_spec.rb
+++ b/spec/metrics_server/metrics_server_spec.rb
@@ -1,13 +1,10 @@
 # frozen_string_literal: true
 
-require 'fast_spec_helper'
+require 'spec_helper'
 
 require_relative '../../metrics_server/metrics_server'
-require_relative '../support/helpers/next_instance_of'
 
 RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
-  include NextInstanceOf
-
   let(:prometheus_config) { ::Prometheus::Client.configuration }
   let(:metrics_dir) { Dir.mktmpdir }
 
@@ -205,4 +202,47 @@
 
     it_behaves_like 'a metrics exporter', 'sidekiq', 'sidekiq_exporter'
   end
+
+  describe '.start_for_puma' do
+    let(:supervisor) { instance_double(Gitlab::ProcessSupervisor) }
+
+    before do
+      allow(Gitlab::ProcessSupervisor).to receive(:instance).and_return(supervisor)
+    end
+
+    it 'spawns a server process and supervises it' do
+      expect(Process).to receive(:spawn).with(
+        include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+      ).once.and_return(42)
+      expect(supervisor).to receive(:supervise).with(42)
+
+      described_class.start_for_puma
+    end
+
+    context 'when the supervisor callback is invoked' do
+      context 'and the supervisor is alive' do
+        it 'restarts the metrics server' do
+          expect(supervisor).to receive(:alive).and_return(true)
+          expect(supervisor).to receive(:supervise).and_yield
+          expect(Process).to receive(:spawn).with(
+            include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+          ).twice.and_return(42)
+
+          described_class.start_for_puma
+        end
+      end
+
+      context 'and the supervisor is not alive' do
+        it 'does not restart the server' do
+          expect(supervisor).to receive(:alive).and_return(false)
+          expect(supervisor).to receive(:supervise).and_yield
+          expect(Process).to receive(:spawn).with(
+            include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
+          ).once.and_return(42)
+
+          described_class.start_for_puma
+        end
+      end
+    end
+  end
 end
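
Reviewer note (not part of the patch above): the supervision contract this change relies on is documented in lib/gitlab/process_supervisor.rb — the block passed to `supervise` receives the PIDs of processes that died and returns the collection of PIDs the supervisor should keep watching; returning an empty collection winds supervision down. `MetricsServer.start_for_puma` and `SidekiqCluster::CLI` both hook into that callback to respawn their metrics server. Below is a minimal, self-contained Ruby sketch of that contract using only the standard library; `spawn_worker`, `alive?`, and the restart counter are illustrative assumptions, not code from this diff.

    # Illustrative sketch of the restart-or-stop contract used by the supervisor above.
    def alive?(pid)
      Process.kill(0, pid)
      true
    rescue Errno::ESRCH, Errno::EPERM
      false
    end

    def spawn_worker
      # Stand-in for MetricsServer.spawn: any short-lived child process will do.
      Process.spawn('sleep 3').tap { |pid| Process.detach(pid) }
    end

    pids = [spawn_worker]
    restarts = 0

    while pids.any?
      sleep 1
      dead = pids.reject { |pid| alive?(pid) }
      next if dead.empty?

      # Callback contract: compute the set of PIDs to keep supervising.
      # Restart dead workers for the first two failures, then return an
      # empty set so the loop (the "supervisor") stops.
      replacements = restarts < 2 ? dead.map { spawn_worker } : []
      restarts += 1
      pids = (pids - dead) + replacements
    end

This mirrors how `start_for_puma`'s block respawns the Puma metrics server only while the supervisor reports itself alive, and how the Sidekiq cluster CLI restarts its metrics server only when the cluster is not already shutting down.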