Skip to content
代码片段 群组 项目
提交 0e77c181 编辑于 作者: Matthias Käppler's avatar Matthias Käppler 提交者: Mark Chao
浏览文件

Fix missing metrics for Sidekiq exporter server

We were accidentally deleting metrics the exporter
server exported about itself. Actual sidekiq worker
metrics were not affected.

This happened because we wiped the metrics dir
whenever sidekiq_0 started up. Since that worker
starts after the metrics server, it was deleting
metric files the server had already written.

Changelog: fixed
上级 18cbe3da
No related branches found
No related tags found
无相关合并请求
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER
# Keep separate directories for separate processes # Keep separate directories for separate processes
def prometheus_default_multiproc_dir def metrics_temp_dir
return unless Rails.env.development? || Rails.env.test? return unless Rails.env.development? || Rails.env.test?
if Gitlab::Runtime.sidekiq? if Gitlab::Runtime.sidekiq?
...@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir ...@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir
end end
end end
# Resolve the directory Prometheus should use for multi-process metric files.
# An explicitly set `prometheus_multiproc_dir` environment variable wins;
# otherwise fall back to the per-runtime temp dir (dev/test only).
def prometheus_metrics_dir
  configured_dir = ENV['prometheus_multiproc_dir']
  configured_dir || metrics_temp_dir
end
# True when the current process is the Puma master — the process responsible
# for managing (and cleaning up) the shared Prometheus metrics directory.
def puma_metrics_server_process?
  Prometheus::PidProvider.worker_id.eql?('puma_master')
end
if puma_metrics_server_process?
  # Wipe stale per-process metric files so they don't accumulate over time.
  # This must run as early as possible — once the app starts emitting metrics,
  # deleting the directory would destroy freshly written files.
  #
  # Sidekiq is deliberately excluded here: its workers are forked by the
  # sidekiq-cluster script, so the equivalent cleanup lives in
  # `sidekiq_cluster/cli.rb`, where it runs before any worker process or the
  # metrics server starts up.
  Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute

  ::Prometheus::Client.reinitialize_on_pid_change(force: true)
end
...@@ -37,7 +39,7 @@ def sidekiq_metrics_server_process? ...@@ -37,7 +39,7 @@ def sidekiq_metrics_server_process?
::Prometheus::Client.configure do |config|
  config.logger = Gitlab::AppLogger

  # All processes write into the same multiproc directory; individual files
  # are disambiguated by the per-process worker id supplied below.
  config.multiprocess_files_dir = prometheus_metrics_dir
  config.pid_provider = ::Prometheus::PidProvider.method(:worker_id)
end
......
...@@ -2,22 +2,17 @@ ...@@ -2,22 +2,17 @@
module Prometheus
  # Deletes stale Prometheus multi-process metric files (*.db) from a given
  # directory. The caller decides which directory to wipe and when, so this
  # service no longer reaches into the global Prometheus client configuration.
  class CleanupMultiprocDirService
    # @param metrics_dir [String, nil] directory holding the *.db metric files
    def initialize(metrics_dir)
      @metrics_dir = metrics_dir
    end

    # Removes every *.db file in the configured directory.
    # No-op when no directory was given or it contains no metric files.
    def execute
      return if @metrics_dir.blank?

      stale_files = Dir[File.join(@metrics_dir, '*.db')]
      return if stale_files.blank?

      FileUtils.rm_rf(stale_files)
    end
  end
end
...@@ -84,7 +84,7 @@ def start ...@@ -84,7 +84,7 @@ def start
end end
FileUtils.mkdir_p(@metrics_dir, mode: 0700) FileUtils.mkdir_p(@metrics_dir, mode: 0700)
::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir
# We need to `warmup: true` since otherwise the sampler and exporter threads enter # We need to `warmup: true` since otherwise the sampler and exporter threads enter
# a race where not all Prometheus db files will be visible to the exporter, resulting # a race where not all Prometheus db files will be visible to the exporter, resulting
......
...@@ -118,6 +118,11 @@ def start_and_supervise_workers(queue_groups) ...@@ -118,6 +118,11 @@ def start_and_supervise_workers(queue_groups)
return if @dryrun return if @dryrun
# Make sure we reset the metrics directory prior to:
# - starting a metrics server process
# - starting new workers
::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute
ProcessManagement.write_pid(@pid) if @pid ProcessManagement.write_pid(@pid) if @pid
supervisor = SidekiqProcessSupervisor.instance( supervisor = SidekiqProcessSupervisor.instance(
...@@ -137,7 +142,7 @@ def start_and_supervise_workers(queue_groups) ...@@ -137,7 +142,7 @@ def start_and_supervise_workers(queue_groups)
# and the metrics server died, restart it. # and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid) if supervisor.alive && dead_pids.include?(metrics_server_pid)
@logger.info('Sidekiq metrics server terminated, restarting...') @logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false) metrics_server_pid = restart_metrics_server
all_pids = worker_pids + Array(metrics_server_pid) all_pids = worker_pids + Array(metrics_server_pid)
else else
# If a worker process died we'll just terminate the whole cluster. # If a worker process died we'll just terminate the whole cluster.
...@@ -154,15 +159,14 @@ def start_and_supervise_workers(queue_groups) ...@@ -154,15 +159,14 @@ def start_and_supervise_workers(queue_groups)
# Boot the Sidekiq metrics server on initial cluster start, when enabled.
def start_metrics_server
  return unless metrics_server_enabled?

  restart_metrics_server
end
# Fork a fresh metrics server process for the Sidekiq cluster. Used both for
# the initial boot and when the supervisor detects the previous server died.
# Metrics-directory cleanup happens once, up front, in
# `start_and_supervise_workers`, so no wipe flag is needed here.
def restart_metrics_server
  @logger.info("Starting metrics server on port #{sidekiq_exporter_port}")

  MetricsServer.fork(
    'sidekiq',
    metrics_dir: @metrics_dir,
    reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS
  )
end
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
end end
let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) } let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) }
before do before do
stub_env('RAILS_ENV', 'test') stub_env('RAILS_ENV', 'test')
...@@ -54,6 +55,8 @@ ...@@ -54,6 +55,8 @@
allow(Gitlab::ProcessManagement).to receive(:write_pid) allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor) allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
allow(supervisor).to receive(:supervise) allow(supervisor).to receive(:supervise)
allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service)
end end
after do after do
...@@ -300,6 +303,12 @@ ...@@ -300,6 +303,12 @@
allow(Gitlab::SidekiqCluster).to receive(:start).and_return([]) allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
end end
it 'wipes the metrics directory' do
expect(metrics_cleanup_service).to receive(:execute)
cli.run(%w(foo))
end
context 'when there are no sidekiq_health_checks settings set' do context 'when there are no sidekiq_health_checks settings set' do
let(:sidekiq_exporter_enabled) { true } let(:sidekiq_exporter_enabled) { true }
...@@ -379,7 +388,7 @@ ...@@ -379,7 +388,7 @@
with_them do with_them do
specify do specify do
if start_metrics_server if start_metrics_server
expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals) expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals)
else else
expect(MetricsServer).not_to receive(:fork) expect(MetricsServer).not_to receive(:fork)
end end
......
# frozen_string_literal: true # frozen_string_literal: true
require 'spec_helper' require 'fast_spec_helper'
RSpec.describe Prometheus::CleanupMultiprocDirService do RSpec.describe Prometheus::CleanupMultiprocDirService do
describe '.call' do describe '#execute' do
subject { described_class.new.execute }
let(:metrics_multiproc_dir) { Dir.mktmpdir } let(:metrics_multiproc_dir) { Dir.mktmpdir }
let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') } let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') }
subject(:service) { described_class.new(metrics_dir_arg).execute }
before do before do
FileUtils.touch(metrics_file_path) FileUtils.touch(metrics_file_path)
end end
after do after do
FileUtils.rm_r(metrics_multiproc_dir) FileUtils.rm_rf(metrics_multiproc_dir)
end end
context 'when `multiprocess_files_dir` is defined' do context 'when `multiprocess_files_dir` is defined' do
before do let(:metrics_dir_arg) { metrics_multiproc_dir }
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(metrics_multiproc_dir)
.at_least(:once)
end
it 'removes old metrics' do it 'removes old metrics' do
expect { subject } expect { service }
.to change { File.exist?(metrics_file_path) } .to change { File.exist?(metrics_file_path) }
.from(true) .from(true)
.to(false) .to(false)
...@@ -34,15 +29,10 @@ ...@@ -34,15 +29,10 @@
end end
context 'when `multiprocess_files_dir` is not defined' do context 'when `multiprocess_files_dir` is not defined' do
before do let(:metrics_dir_arg) { nil }
expect(Prometheus::Client.configuration)
.to receive(:multiprocess_files_dir)
.and_return(nil)
.at_least(:once)
end
it 'does not remove any files' do it 'does not remove any files' do
expect { subject } expect { service }
.not_to change { File.exist?(metrics_file_path) } .not_to change { File.exist?(metrics_file_path) }
.from(true) .from(true)
end end
......
0% 加载中 .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册