From 0e77c1819035d672160269a9bba966ce2bd6bac9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matthias=20K=C3=A4ppler?= <mkaeppler@gitlab.com>
Date: Fri, 25 Mar 2022 09:40:30 +0000
Subject: [PATCH] Fix missing metrics for Sidekiq exporter server

We were accidentally deleting metrics the exporter
server exported about itself. Actual sidekiq worker
metrics were not affected.

This happened because we would wipe the metrics dir
whenever sidekiq_0 starts up, but this happens after
the metrics server starts, so the worker was deleting
those existing metrics.

Changelog: fixed
---
 config/initializers/7_prometheus_metrics.rb   | 24 ++++++++--------
 .../cleanup_multiproc_dir_service.rb          | 19 +++++--------
 metrics_server/metrics_server.rb              |  2 +-
 sidekiq_cluster/cli.rb                        | 12 +++++---
 spec/commands/sidekiq_cluster/cli_spec.rb     | 11 +++++++-
 .../cleanup_multiproc_dir_service_spec.rb     | 28 ++++++-------------
 6 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb
index e4d47d5381539..0d874f5bebc26 100644
--- a/config/initializers/7_prometheus_metrics.rb
+++ b/config/initializers/7_prometheus_metrics.rb
@@ -4,7 +4,7 @@
 require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER
 
 # Keep separate directories for separate processes
-def prometheus_default_multiproc_dir
+def metrics_temp_dir
   return unless Rails.env.development? || Rails.env.test?
 
   if Gitlab::Runtime.sidekiq?
@@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir
   end
 end
 
-def puma_metrics_server_process?
-  Prometheus::PidProvider.worker_id == 'puma_master'
+def prometheus_metrics_dir
+  ENV['prometheus_multiproc_dir'] || metrics_temp_dir
 end
 
-def sidekiq_metrics_server_process?
-  Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0')
+def puma_metrics_server_process?
+  Prometheus::PidProvider.worker_id == 'puma_master'
 end
 
-if puma_metrics_server_process? || sidekiq_metrics_server_process?
+if puma_metrics_server_process?
   # The following is necessary to ensure stale Prometheus metrics don't accumulate over time.
-  # It needs to be done as early as here to ensure metrics files aren't deleted.
-  # After we hit our app in `warmup`, first metrics and corresponding files already being created,
-  # for example in `lib/gitlab/metrics/requests_rack_middleware.rb`.
-  Prometheus::CleanupMultiprocDirService.new.execute
+  # It needs to be done as early as possible to ensure new metrics aren't being deleted.
+  #
+  # Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the
+  # sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead,
+  # since it must happen prior to any worker processes or the metrics server starting up.
+  Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute
 
   ::Prometheus::Client.reinitialize_on_pid_change(force: true)
 end
@@ -37,7 +39,7 @@ def sidekiq_metrics_server_process?
 ::Prometheus::Client.configure do |config|
   config.logger = Gitlab::AppLogger
 
-  config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir
+  config.multiprocess_files_dir = prometheus_metrics_dir
 
   config.pid_provider = ::Prometheus::PidProvider.method(:worker_id)
 end
diff --git a/lib/prometheus/cleanup_multiproc_dir_service.rb b/lib/prometheus/cleanup_multiproc_dir_service.rb
index 6418b4de166b0..b309247fa73ba 100644
--- a/lib/prometheus/cleanup_multiproc_dir_service.rb
+++ b/lib/prometheus/cleanup_multiproc_dir_service.rb
@@ -2,22 +2,17 @@
 
 module Prometheus
   class CleanupMultiprocDirService
-    include Gitlab::Utils::StrongMemoize
-
-    def execute
-      FileUtils.rm_rf(old_metrics) if old_metrics
+    def initialize(metrics_dir)
+      @metrics_dir = metrics_dir
     end
 
-    private
+    def execute
+      return if @metrics_dir.blank?
 
-    def old_metrics
-      strong_memoize(:old_metrics) do
-        Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir
-      end
-    end
+      files_to_delete = Dir[File.join(@metrics_dir, '*.db')]
+      return if files_to_delete.blank?
 
-    def multiprocess_files_dir
-      ::Prometheus::Client.configuration.multiprocess_files_dir
+      FileUtils.rm_rf(files_to_delete)
     end
   end
 end
diff --git a/metrics_server/metrics_server.rb b/metrics_server/metrics_server.rb
index 2aadc63d65d3a..695f3a65930a8 100644
--- a/metrics_server/metrics_server.rb
+++ b/metrics_server/metrics_server.rb
@@ -84,7 +84,7 @@ def start
     end
 
     FileUtils.mkdir_p(@metrics_dir, mode: 0700)
-    ::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir
+    ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir
 
     # We need to `warmup: true` since otherwise the sampler and exporter threads enter
     # a race where not all Prometheus db files will be visible to the exporter, resulting
diff --git a/sidekiq_cluster/cli.rb b/sidekiq_cluster/cli.rb
index e04f5ab1d42b0..59f39003429ed 100644
--- a/sidekiq_cluster/cli.rb
+++ b/sidekiq_cluster/cli.rb
@@ -118,6 +118,11 @@ def start_and_supervise_workers(queue_groups)
 
         return if @dryrun
 
+        # Make sure we reset the metrics directory prior to:
+        # - starting a metrics server process
+        # - starting new workers
+        ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute
+
         ProcessManagement.write_pid(@pid) if @pid
 
         supervisor = SidekiqProcessSupervisor.instance(
@@ -137,7 +142,7 @@ def start_and_supervise_workers(queue_groups)
           # and the metrics server died, restart it.
           if supervisor.alive && dead_pids.include?(metrics_server_pid)
             @logger.info('Sidekiq metrics server terminated, restarting...')
-            metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
+            metrics_server_pid = restart_metrics_server
             all_pids = worker_pids + Array(metrics_server_pid)
           else
             # If a worker process died we'll just terminate the whole cluster.
@@ -154,15 +159,14 @@ def start_and_supervise_workers(queue_groups)
       def start_metrics_server
         return unless metrics_server_enabled?
 
-        restart_metrics_server(wipe_metrics_dir: true)
+        restart_metrics_server
       end
 
-      def restart_metrics_server(wipe_metrics_dir: false)
+      def restart_metrics_server
         @logger.info("Starting metrics server on port #{sidekiq_exporter_port}")
         MetricsServer.fork(
           'sidekiq',
           metrics_dir: @metrics_dir,
-          wipe_metrics_dir: wipe_metrics_dir,
           reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS
         )
       end
diff --git a/spec/commands/sidekiq_cluster/cli_spec.rb b/spec/commands/sidekiq_cluster/cli_spec.rb
index 2cb3f67b03d8b..e2fc907fd05df 100644
--- a/spec/commands/sidekiq_cluster/cli_spec.rb
+++ b/spec/commands/sidekiq_cluster/cli_spec.rb
@@ -41,6 +41,7 @@
   end
 
   let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
+  let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) }
 
   before do
     stub_env('RAILS_ENV', 'test')
@@ -54,6 +55,8 @@
     allow(Gitlab::ProcessManagement).to receive(:write_pid)
     allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
     allow(supervisor).to receive(:supervise)
+
+    allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service)
   end
 
   after do
@@ -300,6 +303,12 @@
             allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
           end
 
+          it 'wipes the metrics directory' do
+            expect(metrics_cleanup_service).to receive(:execute)
+
+            cli.run(%w(foo))
+          end
+
           context 'when there are no sidekiq_health_checks settings set' do
             let(:sidekiq_exporter_enabled) { true }
 
@@ -379,7 +388,7 @@
             with_them do
               specify do
                 if start_metrics_server
-                  expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals)
+                  expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals)
                 else
                   expect(MetricsServer).not_to receive(:fork)
                 end
diff --git a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb
index db77a5d42d847..bdf9673a53f9c 100644
--- a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb
+++ b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb
@@ -1,32 +1,27 @@
 # frozen_string_literal: true
 
-require 'spec_helper'
+require 'fast_spec_helper'
 
 RSpec.describe Prometheus::CleanupMultiprocDirService do
-  describe '.call' do
-    subject { described_class.new.execute }
-
+  describe '#execute' do
     let(:metrics_multiproc_dir) { Dir.mktmpdir }
     let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') }
 
+    subject(:service) { described_class.new(metrics_dir_arg).execute }
+
     before do
       FileUtils.touch(metrics_file_path)
     end
 
     after do
-      FileUtils.rm_r(metrics_multiproc_dir)
+      FileUtils.rm_rf(metrics_multiproc_dir)
     end
 
     context 'when `multiprocess_files_dir` is defined' do
-      before do
-        expect(Prometheus::Client.configuration)
-              .to receive(:multiprocess_files_dir)
-              .and_return(metrics_multiproc_dir)
-              .at_least(:once)
-      end
+      let(:metrics_dir_arg) { metrics_multiproc_dir }
 
       it 'removes old metrics' do
-        expect { subject }
+        expect { service }
           .to change { File.exist?(metrics_file_path) }
           .from(true)
           .to(false)
@@ -34,15 +29,10 @@
     end
 
     context 'when `multiprocess_files_dir` is not defined' do
-      before do
-        expect(Prometheus::Client.configuration)
-              .to receive(:multiprocess_files_dir)
-              .and_return(nil)
-              .at_least(:once)
-      end
+      let(:metrics_dir_arg) { nil }
 
       it 'does not remove any files' do
-        expect { subject }
+        expect { service }
           .not_to change { File.exist?(metrics_file_path) }
           .from(true)
       end
-- 
GitLab