From 57e7e0b0a709d2d51e8af6a8053eb60de86d3d51 Mon Sep 17 00:00:00 2001
From: Matthias Kaeppler <mkaeppler@gitlab.com>
Date: Thu, 30 Jun 2022 14:04:20 +0200
Subject: [PATCH] Add heap fragmentation metric

We need to know how fragmented the Ruby heap is, because
fragmentation can lead to higher, sustained memory use
in production.

We add this metric to the existing RubySampler.

Changelog: added
---
 .../monitoring/prometheus/gitlab_metrics.md   |  1 +
 lib/gitlab/metrics/memory.rb                  | 15 +++++++
 lib/gitlab/metrics/samplers/ruby_sampler.rb   | 12 ++++--
 metrics_server/dependencies.rb                |  1 +
 spec/lib/gitlab/metrics/memory_spec.rb        | 40 +++++++++++++++++++
 .../metrics/samplers/ruby_sampler_spec.rb     |  6 +++
 6 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 lib/gitlab/metrics/memory.rb
 create mode 100644 spec/lib/gitlab/metrics/memory_spec.rb

diff --git a/doc/administration/monitoring/prometheus/gitlab_metrics.md b/doc/administration/monitoring/prometheus/gitlab_metrics.md
index 4f8fbd0c07e22..b19b342c665b4 100644
--- a/doc/administration/monitoring/prometheus/gitlab_metrics.md
+++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md
@@ -347,6 +347,7 @@ Some basic Ruby runtime metrics are available:
 |:---------------------------------------- |:--------- |:----- |:----------- |
 | `ruby_gc_duration_seconds`               | Counter   | 11.1  | Time spent by Ruby in GC |
 | `ruby_gc_stat_...`                       | Gauge     | 11.1  | Various metrics from [GC.stat](https://ruby-doc.org/core-2.6.5/GC.html#method-c-stat) |
+| `ruby_gc_stat_ext_heap_fragmentation`    | Gauge     | 15.2  | Degree of Ruby heap fragmentation, computed as 1 minus the ratio of live slots to total eden slots (range 0 to 1; higher means more fragmented) |
 | `ruby_file_descriptors`                  | Gauge     | 11.1  | File descriptors per process |
 | `ruby_sampler_duration_seconds`          | Counter   | 11.1  | Time spent collecting stats |
 | `ruby_process_cpu_seconds_total`         | Gauge     | 12.0  | Total amount of CPU time per process |
diff --git a/lib/gitlab/metrics/memory.rb b/lib/gitlab/metrics/memory.rb
new file mode 100644
index 0000000000000..c165cdec7a3dd
--- /dev/null
+++ b/lib/gitlab/metrics/memory.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module Metrics
+    module Memory
+      extend self
+
+      HEAP_SLOTS_PER_PAGE = GC::INTERNAL_CONSTANTS[:HEAP_PAGE_OBJ_LIMIT]
+
+      def gc_heap_fragmentation(gc_stat = GC.stat)
+        1 - (gc_stat[:heap_live_slots] / (HEAP_SLOTS_PER_PAGE * gc_stat[:heap_eden_pages].to_f))
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/metrics/samplers/ruby_sampler.rb b/lib/gitlab/metrics/samplers/ruby_sampler.rb
index 4a3ef3711a508..8e0022933471c 100644
--- a/lib/gitlab/metrics/samplers/ruby_sampler.rb
+++ b/lib/gitlab/metrics/samplers/ruby_sampler.rb
@@ -39,7 +39,8 @@ def init_metrics
             process_proportional_memory_bytes: ::Gitlab::Metrics.gauge(metric_name(:process, :proportional_memory_bytes), 'Memory used (PSS)', labels),
             process_start_time_seconds:        ::Gitlab::Metrics.gauge(metric_name(:process, :start_time_seconds), 'Process start time seconds'),
             sampler_duration:                  ::Gitlab::Metrics.counter(metric_name(:sampler, :duration_seconds_total), 'Sampler time', labels),
-            gc_duration_seconds:               ::Gitlab::Metrics.histogram(metric_name(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS)
+            gc_duration_seconds:               ::Gitlab::Metrics.histogram(metric_name(:gc, :duration_seconds), 'GC time', labels, GC_REPORT_BUCKETS),
+            heap_fragmentation:                ::Gitlab::Metrics.gauge(metric_name(:gc_stat_ext, :heap_fragmentation), 'Ruby heap fragmentation', labels)
           }
 
           GC.stat.keys.each do |key|
@@ -76,8 +77,13 @@ def sample_gc
           end
 
           # Collect generic GC stats
-          GC.stat.each do |key, value|
-            metrics[key].set(labels, value)
+          GC.stat.then do |gc_stat|
+            gc_stat.each do |key, value|
+              metrics[key].set(labels, value)
+            end
+
+            # Collect custom GC stats
+            metrics[:heap_fragmentation].set(labels, Memory.gc_heap_fragmentation(gc_stat))
           end
         end
 
diff --git a/metrics_server/dependencies.rb b/metrics_server/dependencies.rb
index 3f188658ba298..233511eb505e4 100644
--- a/metrics_server/dependencies.rb
+++ b/metrics_server/dependencies.rb
@@ -20,6 +20,7 @@
 require_relative '../lib/gitlab/metrics/prometheus'
 require_relative '../lib/gitlab/metrics'
 require_relative '../lib/gitlab/metrics/system'
+require_relative '../lib/gitlab/metrics/memory'
 require_relative '../lib/gitlab/metrics/samplers/base_sampler'
 require_relative '../lib/gitlab/metrics/samplers/ruby_sampler'
 require_relative '../lib/gitlab/metrics/exporter/base_exporter'
diff --git a/spec/lib/gitlab/metrics/memory_spec.rb b/spec/lib/gitlab/metrics/memory_spec.rb
new file mode 100644
index 0000000000000..fd8ca3b37c634
--- /dev/null
+++ b/spec/lib/gitlab/metrics/memory_spec.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+require 'fast_spec_helper'
+
+RSpec.describe Gitlab::Metrics::Memory do
+  describe '.gc_heap_fragmentation' do
+    subject(:call) do
+      described_class.gc_heap_fragmentation(
+        heap_live_slots: gc_stat_heap_live_slots,
+        heap_eden_pages: gc_stat_heap_eden_pages
+      )
+    end
+
+    context 'when the Ruby heap is perfectly utilized' do
+      # All objects are located in a single heap page.
+      let(:gc_stat_heap_live_slots) { described_class::HEAP_SLOTS_PER_PAGE }
+      let(:gc_stat_heap_eden_pages) { 1 }
+
+      it { is_expected.to eq(0) }
+    end
+
+    context 'when the Ruby heap is greatly fragmented' do
+      # There is one object per heap page.
+      let(:gc_stat_heap_live_slots) { described_class::HEAP_SLOTS_PER_PAGE }
+      let(:gc_stat_heap_eden_pages) { described_class::HEAP_SLOTS_PER_PAGE }
+
+      # The heap can never be "perfectly fragmented" because that would require
+      # zero objects per page.
+      it { is_expected.to be > 0.99 }
+    end
+
+    context 'when the Ruby heap is semi-fragmented' do
+      # All objects are spread over two pages, i.e. the pages are 50% utilized on average.
+      let(:gc_stat_heap_live_slots) { described_class::HEAP_SLOTS_PER_PAGE }
+      let(:gc_stat_heap_eden_pages) { 2 }
+
+      it { is_expected.to eq(0.5) }
+    end
+  end
+end
diff --git a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
index dfae5aa678450..b1566ffa7b460 100644
--- a/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
+++ b/spec/lib/gitlab/metrics/samplers/ruby_sampler_spec.rb
@@ -125,5 +125,11 @@
 
       sampler.sample
     end
+
+    it 'adds a heap fragmentation metric' do
+      expect(sampler.metrics[:heap_fragmentation]).to receive(:set).with({}, anything)
+
+      sampler.sample
+    end
   end
 end
-- 
GitLab