diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb index e4d47d5381539f7d8d809d060536cfc6452245da..0d874f5bebc26adb4d8b1b81d7b87df20829bca9 100644 --- a/config/initializers/7_prometheus_metrics.rb +++ b/config/initializers/7_prometheus_metrics.rb @@ -4,7 +4,7 @@ require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER # Keep separate directories for separate processes -def prometheus_default_multiproc_dir +def metrics_temp_dir return unless Rails.env.development? || Rails.env.test? if Gitlab::Runtime.sidekiq? @@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir end end -def puma_metrics_server_process? - Prometheus::PidProvider.worker_id == 'puma_master' +def prometheus_metrics_dir + ENV['prometheus_multiproc_dir'] || metrics_temp_dir end -def sidekiq_metrics_server_process? - Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0') +def puma_metrics_server_process? + Prometheus::PidProvider.worker_id == 'puma_master' end -if puma_metrics_server_process? || sidekiq_metrics_server_process? +if puma_metrics_server_process? # The following is necessary to ensure stale Prometheus metrics don't accumulate over time. - # It needs to be done as early as here to ensure metrics files aren't deleted. - # After we hit our app in `warmup`, first metrics and corresponding files already being created, - # for example in `lib/gitlab/metrics/requests_rack_middleware.rb`. - Prometheus::CleanupMultiprocDirService.new.execute + # It needs to be done as early as possible to ensure new metrics aren't being deleted. + # + # Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the + # sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead, + # since it must happen prior to any worker processes or the metrics server starting up. + Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute ::Prometheus::Client.reinitialize_on_pid_change(force: true) end @@ -37,7 +39,7 @@ def sidekiq_metrics_server_process? ::Prometheus::Client.configure do |config| config.logger = Gitlab::AppLogger - config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir + config.multiprocess_files_dir = prometheus_metrics_dir config.pid_provider = ::Prometheus::PidProvider.method(:worker_id) end diff --git a/lib/prometheus/cleanup_multiproc_dir_service.rb b/lib/prometheus/cleanup_multiproc_dir_service.rb index 6418b4de166b0c8a86df67740b95f6e5c2db57e5..b309247fa73baff48266008f9dad520110e7eba8 100644 --- a/lib/prometheus/cleanup_multiproc_dir_service.rb +++ b/lib/prometheus/cleanup_multiproc_dir_service.rb @@ -2,22 +2,17 @@ module Prometheus class CleanupMultiprocDirService - include Gitlab::Utils::StrongMemoize - - def execute - FileUtils.rm_rf(old_metrics) if old_metrics + def initialize(metrics_dir) + @metrics_dir = metrics_dir end - private + def execute + return if @metrics_dir.blank? - def old_metrics - strong_memoize(:old_metrics) do - Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir - end - end + files_to_delete = Dir[File.join(@metrics_dir, '*.db')] + return if files_to_delete.blank? - def multiprocess_files_dir - ::Prometheus::Client.configuration.multiprocess_files_dir + FileUtils.rm_rf(files_to_delete) end end end diff --git a/metrics_server/metrics_server.rb b/metrics_server/metrics_server.rb index 2aadc63d65d3a5820efccdecb680ee6d30b19a2b..695f3a65930a89502de43224bd20886c479b9d7d 100644 --- a/metrics_server/metrics_server.rb +++ b/metrics_server/metrics_server.rb @@ -84,7 +84,7 @@ def start end FileUtils.mkdir_p(@metrics_dir, mode: 0700) - ::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir + ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir # We need to `warmup: true` since otherwise the sampler and exporter threads enter # a race where not all Prometheus db files will be visible to the exporter, resulting diff --git a/sidekiq_cluster/cli.rb b/sidekiq_cluster/cli.rb index e04f5ab1d42b082b2c253eebc438d6d9b539b99a..59f39003429edc968c09716325e5fa44bdffbb6c 100644 --- a/sidekiq_cluster/cli.rb +++ b/sidekiq_cluster/cli.rb @@ -118,6 +118,11 @@ def start_and_supervise_workers(queue_groups) return if @dryrun + # Make sure we reset the metrics directory prior to: + # - starting a metrics server process + # - starting new workers + ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute + ProcessManagement.write_pid(@pid) if @pid supervisor = SidekiqProcessSupervisor.instance( @@ -137,7 +142,7 @@ def start_and_supervise_workers(queue_groups) # and the metrics server died, restart it. if supervisor.alive && dead_pids.include?(metrics_server_pid) @logger.info('Sidekiq metrics server terminated, restarting...') - metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false) + metrics_server_pid = restart_metrics_server all_pids = worker_pids + Array(metrics_server_pid) else # If a worker process died we'll just terminate the whole cluster. @@ -154,15 +159,14 @@ def start_and_supervise_workers(queue_groups) def start_metrics_server return unless metrics_server_enabled? - restart_metrics_server(wipe_metrics_dir: true) + restart_metrics_server end - def restart_metrics_server(wipe_metrics_dir: false) + def restart_metrics_server @logger.info("Starting metrics server on port #{sidekiq_exporter_port}") MetricsServer.fork( 'sidekiq', metrics_dir: @metrics_dir, - wipe_metrics_dir: wipe_metrics_dir, reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS ) end diff --git a/spec/commands/sidekiq_cluster/cli_spec.rb b/spec/commands/sidekiq_cluster/cli_spec.rb index 2cb3f67b03d8bb1fda57e7bdc25bb8ea2332a426..e2fc907fd05df64b957dac0f5e10c9ccebda5dc7 100644 --- a/spec/commands/sidekiq_cluster/cli_spec.rb +++ b/spec/commands/sidekiq_cluster/cli_spec.rb @@ -41,6 +41,7 @@ end let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) } + let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) } before do stub_env('RAILS_ENV', 'test') @@ -54,6 +55,8 @@ allow(Gitlab::ProcessManagement).to receive(:write_pid) allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor) allow(supervisor).to receive(:supervise) + + allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service) end after do @@ -300,6 +303,12 @@ allow(Gitlab::SidekiqCluster).to receive(:start).and_return([]) end + it 'wipes the metrics directory' do + expect(metrics_cleanup_service).to receive(:execute) + + cli.run(%w(foo)) + end + context 'when there are no sidekiq_health_checks settings set' do let(:sidekiq_exporter_enabled) { true } @@ -379,7 +388,7 @@ with_them do specify do if start_metrics_server - expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals) + expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals) else expect(MetricsServer).not_to receive(:fork) end diff --git a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb index db77a5d42d847766a228f9ea14360469e2d28508..bdf9673a53f9c465fc063ab1b5a746eefbbdf702 100644 --- a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb +++ b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb @@ -1,32 +1,27 @@ # frozen_string_literal: true -require 'spec_helper' +require 'fast_spec_helper' RSpec.describe Prometheus::CleanupMultiprocDirService do - describe '.call' do - subject { described_class.new.execute } - + describe '#execute' do let(:metrics_multiproc_dir) { Dir.mktmpdir } let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') } + subject(:service) { described_class.new(metrics_dir_arg).execute } + before do FileUtils.touch(metrics_file_path) end after do - FileUtils.rm_r(metrics_multiproc_dir) + FileUtils.rm_rf(metrics_multiproc_dir) end context 'when `multiprocess_files_dir` is defined' do - before do - expect(Prometheus::Client.configuration) - .to receive(:multiprocess_files_dir) - .and_return(metrics_multiproc_dir) - .at_least(:once) - end + let(:metrics_dir_arg) { metrics_multiproc_dir } it 'removes old metrics' do - expect { subject } + expect { service } .to change { File.exist?(metrics_file_path) } .from(true) .to(false) @@ -34,15 +29,10 @@ end context 'when `multiprocess_files_dir` is not defined' do - before do - expect(Prometheus::Client.configuration) - .to receive(:multiprocess_files_dir) - .and_return(nil) - .at_least(:once) - end + let(:metrics_dir_arg) { nil } it 'does not remove any files' do - expect { subject } + expect { service } .not_to change { File.exist?(metrics_file_path) } .from(true) end