From 51c4ba46926a5a2fbf6276cbf7cc6a141453c534 Mon Sep 17 00:00:00 2001
From: Stan Hu <stanhu@gmail.com>
Date: Mon, 30 Dec 2019 23:47:48 -0800
Subject: [PATCH] Disable Prometheus metrics if initialization fails

Previously if the underlying filesystem ran of space, reads and writes
to mmap() regions would throw an ugly SIGBUS error and crash.

With prometheus-client-mmap v0.10.0,
`Prometheus::Client.reinitialize_on_pid_change` will now throw an
IOError if initialization fails for some reason. If this happens, we
disable internal Prometheus metrics to ensure the system stays up.

Closes https://gitlab.com/gitlab-org/gitlab/issues/24425
---
 Gemfile                                         |  2 +-
 Gemfile.lock                                    |  4 ++--
 .../sh-disable-prom-metrics-on-failure.yml      |  5 +++++
 config/initializers/7_prometheus_metrics.rb     |  6 ++++++
 lib/gitlab/metrics.rb                           |  6 ++++++
 lib/gitlab/metrics/prometheus.rb                | 10 +++++++++-
 spec/lib/gitlab/metrics/prometheus_spec.rb      | 17 +++++++++++++++++
 7 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml

diff --git a/Gemfile b/Gemfile
index 5f3721e7663be..0c4f8d83782b7 100644
--- a/Gemfile
+++ b/Gemfile
@@ -327,7 +327,7 @@ group :metrics do
   gem 'influxdb', '~> 0.2', require: false
 
   # Prometheus
-  gem 'prometheus-client-mmap', '~> 0.9.10'
+  gem 'prometheus-client-mmap', '~> 0.10.0'
   gem 'raindrops', '~> 0.18'
 end
 
diff --git a/Gemfile.lock b/Gemfile.lock
index c5f3dc3e5a4a8..6a19d35774d80 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -749,7 +749,7 @@ GEM
       parser
       unparser
     procto (0.0.3)
-    prometheus-client-mmap (0.9.10)
+    prometheus-client-mmap (0.10.0)
     pry (0.11.3)
       coderay (~> 1.1.0)
       method_source (~> 0.9.0)
@@ -1292,7 +1292,7 @@ DEPENDENCIES
   pg (~> 1.1)
   png_quantizator (~> 0.2.1)
   premailer-rails (~> 1.10.3)
-  prometheus-client-mmap (~> 0.9.10)
+  prometheus-client-mmap (~> 0.10.0)
   pry-byebug (~> 3.5.1)
   pry-rails (~> 0.3.4)
   rack (~> 2.0.7)
diff --git a/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml b/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml
new file mode 100644
index 0000000000000..d9db2847d2ee2
--- /dev/null
+++ b/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml
@@ -0,0 +1,5 @@
+---
+title: Disable Prometheus metrics if initialization fails
+merge_request: 22355
+author:
+type: fixed
diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb
index 22bb5f1764ddf..aa2601ea65037 100644
--- a/config/initializers/7_prometheus_metrics.rb
+++ b/config/initializers/7_prometheus_metrics.rb
@@ -43,6 +43,9 @@ def prometheus_default_multiproc_dir
     defined?(::Prometheus::Client.reinitialize_on_pid_change) && Prometheus::Client.reinitialize_on_pid_change
 
     Gitlab::Metrics::Samplers::RubySampler.initialize_instance(Settings.monitoring.ruby_sampler_interval).start
+  rescue IOError => e
+    Gitlab::ErrorTracking.track_exception(e)
+    Gitlab::Metrics.error_detected!
   end
 
   Gitlab::Cluster::LifecycleEvents.on_master_start do
@@ -55,6 +58,9 @@ def prometheus_default_multiproc_dir
     end
 
     Gitlab::Metrics::RequestsRackMiddleware.initialize_http_request_duration_seconds
+  rescue IOError => e
+    Gitlab::ErrorTracking.track_exception(e)
+    Gitlab::Metrics.error_detected!
   end
 end
 
diff --git a/lib/gitlab/metrics.rb b/lib/gitlab/metrics.rb
index 61ed20ad6236b..d759ae24051f4 100644
--- a/lib/gitlab/metrics.rb
+++ b/lib/gitlab/metrics.rb
@@ -5,8 +5,14 @@ module Metrics
     include Gitlab::Metrics::InfluxDb
     include Gitlab::Metrics::Prometheus
 
+    @error = false
+
     def self.enabled?
       influx_metrics_enabled? || prometheus_metrics_enabled?
     end
+
+    def self.error?
+      @error
+    end
   end
 end
diff --git a/lib/gitlab/metrics/prometheus.rb b/lib/gitlab/metrics/prometheus.rb
index cab1edab48fb3..f7480a8789e99 100644
--- a/lib/gitlab/metrics/prometheus.rb
+++ b/lib/gitlab/metrics/prometheus.rb
@@ -61,6 +61,14 @@ def histogram(name, docstring, base_labels = {}, buckets = ::Prometheus::Client:
           safe_provide_metric(:histogram, name, docstring, base_labels, buckets)
         end
 
+        def error_detected!
+          clear_memoization(:prometheus_metrics_enabled)
+
+          PROVIDER_MUTEX.synchronize do
+            @error = true
+          end
+        end
+
         private
 
         def safe_provide_metric(method, name, *args)
@@ -81,7 +89,7 @@ def provide_metric(name)
         end
 
         def prometheus_metrics_enabled_unmemoized
-          metrics_folder_present? && Gitlab::CurrentSettings.prometheus_metrics_enabled || false
+          !error? && metrics_folder_present? && Gitlab::CurrentSettings.prometheus_metrics_enabled || false
         end
       end
     end
diff --git a/spec/lib/gitlab/metrics/prometheus_spec.rb b/spec/lib/gitlab/metrics/prometheus_spec.rb
index b37624982e214..d4aa96a5b2028 100644
--- a/spec/lib/gitlab/metrics/prometheus_spec.rb
+++ b/spec/lib/gitlab/metrics/prometheus_spec.rb
@@ -17,4 +17,21 @@
       expect(all_metrics.registry.metrics.count).to eq(0)
     end
   end
+
+  describe '#error_detected!' do
+    before do
+      allow(all_metrics).to receive(:metrics_folder_present?).and_return(true)
+      stub_application_setting(prometheus_metrics_enabled: true)
+    end
+
+    it 'disables Prometheus metrics' do
+      expect(all_metrics.error?).to be_falsey
+      expect(all_metrics.prometheus_metrics_enabled?).to be_truthy
+
+      all_metrics.error_detected!
+
+      expect(all_metrics.prometheus_metrics_enabled?).to be_falsey
+      expect(all_metrics.error?).to be_truthy
+    end
+  end
 end
-- 
GitLab