From 51c4ba46926a5a2fbf6276cbf7cc6a141453c534 Mon Sep 17 00:00:00 2001 From: Stan Hu <stanhu@gmail.com> Date: Mon, 30 Dec 2019 23:47:48 -0800 Subject: [PATCH] Disable Prometheus metrics if initialization fails Previously if the underlying filesystem ran of space, reads and writes to mmap() regions would throw an ugly SIGBUS error and crash. With prometheus-client-mmap v0.10.0, `Prometheus::Client.reinitialize_on_pid_change` will now throw an IOError if initialization fails for some reason. If this happens, we disable internal Prometheus metrics to ensure the system stays up. Closes https://gitlab.com/gitlab-org/gitlab/issues/24425 --- Gemfile | 2 +- Gemfile.lock | 4 ++-- .../sh-disable-prom-metrics-on-failure.yml | 5 +++++ config/initializers/7_prometheus_metrics.rb | 6 ++++++ lib/gitlab/metrics.rb | 6 ++++++ lib/gitlab/metrics/prometheus.rb | 10 +++++++++- spec/lib/gitlab/metrics/prometheus_spec.rb | 17 +++++++++++++++++ 7 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml diff --git a/Gemfile b/Gemfile index 5f3721e7663be..0c4f8d83782b7 100644 --- a/Gemfile +++ b/Gemfile @@ -327,7 +327,7 @@ group :metrics do gem 'influxdb', '~> 0.2', require: false # Prometheus - gem 'prometheus-client-mmap', '~> 0.9.10' + gem 'prometheus-client-mmap', '~> 0.10.0' gem 'raindrops', '~> 0.18' end diff --git a/Gemfile.lock b/Gemfile.lock index c5f3dc3e5a4a8..6a19d35774d80 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -749,7 +749,7 @@ GEM parser unparser procto (0.0.3) - prometheus-client-mmap (0.9.10) + prometheus-client-mmap (0.10.0) pry (0.11.3) coderay (~> 1.1.0) method_source (~> 0.9.0) @@ -1292,7 +1292,7 @@ DEPENDENCIES pg (~> 1.1) png_quantizator (~> 0.2.1) premailer-rails (~> 1.10.3) - prometheus-client-mmap (~> 0.9.10) + prometheus-client-mmap (~> 0.10.0) pry-byebug (~> 3.5.1) pry-rails (~> 0.3.4) rack (~> 2.0.7) diff --git a/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml b/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml new file mode 100644 index 0000000000000..d9db2847d2ee2 --- /dev/null +++ b/changelogs/unreleased/sh-disable-prom-metrics-on-failure.yml @@ -0,0 +1,5 @@ +--- +title: Disable Prometheus metrics if initialization fails +merge_request: 22355 +author: +type: fixed diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb index 22bb5f1764ddf..aa2601ea65037 100644 --- a/config/initializers/7_prometheus_metrics.rb +++ b/config/initializers/7_prometheus_metrics.rb @@ -43,6 +43,9 @@ def prometheus_default_multiproc_dir defined?(::Prometheus::Client.reinitialize_on_pid_change) && Prometheus::Client.reinitialize_on_pid_change Gitlab::Metrics::Samplers::RubySampler.initialize_instance(Settings.monitoring.ruby_sampler_interval).start + rescue IOError => e + Gitlab::ErrorTracking.track_exception(e) + Gitlab::Metrics.error_detected! end Gitlab::Cluster::LifecycleEvents.on_master_start do @@ -55,6 +58,9 @@ def prometheus_default_multiproc_dir end Gitlab::Metrics::RequestsRackMiddleware.initialize_http_request_duration_seconds + rescue IOError => e + Gitlab::ErrorTracking.track_exception(e) + Gitlab::Metrics.error_detected! end end diff --git a/lib/gitlab/metrics.rb b/lib/gitlab/metrics.rb index 61ed20ad6236b..d759ae24051f4 100644 --- a/lib/gitlab/metrics.rb +++ b/lib/gitlab/metrics.rb @@ -5,8 +5,14 @@ module Metrics include Gitlab::Metrics::InfluxDb include Gitlab::Metrics::Prometheus + @error = false + def self.enabled? influx_metrics_enabled? || prometheus_metrics_enabled? end + + def self.error? + @error + end end end diff --git a/lib/gitlab/metrics/prometheus.rb b/lib/gitlab/metrics/prometheus.rb index cab1edab48fb3..f7480a8789e99 100644 --- a/lib/gitlab/metrics/prometheus.rb +++ b/lib/gitlab/metrics/prometheus.rb @@ -61,6 +61,14 @@ def histogram(name, docstring, base_labels = {}, buckets = ::Prometheus::Client: safe_provide_metric(:histogram, name, docstring, base_labels, buckets) end + def error_detected! + clear_memoization(:prometheus_metrics_enabled) + + PROVIDER_MUTEX.synchronize do + @error = true + end + end + private def safe_provide_metric(method, name, *args) @@ -81,7 +89,7 @@ def provide_metric(name) end def prometheus_metrics_enabled_unmemoized - metrics_folder_present? && Gitlab::CurrentSettings.prometheus_metrics_enabled || false + !error? && metrics_folder_present? && Gitlab::CurrentSettings.prometheus_metrics_enabled || false end end end diff --git a/spec/lib/gitlab/metrics/prometheus_spec.rb b/spec/lib/gitlab/metrics/prometheus_spec.rb index b37624982e214..d4aa96a5b2028 100644 --- a/spec/lib/gitlab/metrics/prometheus_spec.rb +++ b/spec/lib/gitlab/metrics/prometheus_spec.rb @@ -17,4 +17,21 @@ expect(all_metrics.registry.metrics.count).to eq(0) end end + + describe '#error_detected!' do + before do + allow(all_metrics).to receive(:metrics_folder_present?).and_return(true) + stub_application_setting(prometheus_metrics_enabled: true) + end + + it 'disables Prometheus metrics' do + expect(all_metrics.error?).to be_falsey + expect(all_metrics.prometheus_metrics_enabled?).to be_truthy + + all_metrics.error_detected! + + expect(all_metrics.prometheus_metrics_enabled?).to be_falsey + expect(all_metrics.error?).to be_truthy + end + end end -- GitLab